From d5586dfb9e36642aabb373c952c1017dcff04382 Mon Sep 17 00:00:00 2001
From: Dsplib <admin@dsplib.org>
Date: Tue, 10 Nov 2020 19:06:20 +0300
Subject: [PATCH] improved FFT performance and added verificator for radix-2
 fft from 4 points to 65536

Changes to be committed:
modified:   _release/dspl.c
modified:   _release/dspl.h
modified:   dspl/dox/doxyfile_en
modified:   dspl/dox/footer_en.html
modified:   dspl/dox/header_en.html
modified:   dspl/src/dspl_internal.h
modified:   dspl/src/fft.c
modified:   dspl/src/fft_subkernel.c
modified:   dspl/src/filter_ap.c
modified:   include/dspl.h
modified:   performance/bin/octave/fft_cmplx_performance.m
modified:   performance/src/fft_cmplx_performance.c
new file:   verification/bin/octave/fft_radix2_verification.m
new file:   verification/src/fft_radix2_verification.c
---
 _release/dspl.c                               |   6 +-
 _release/dspl.h                               | 119 +--
 dspl/dox/doxyfile_en                          |  60 +-
 dspl/dox/footer_en.html                       |   7 +-
 dspl/dox/header_en.html                       |   2 +-
 dspl/src/dspl_internal.h                      |  43 +-
 dspl/src/fft.c                                | 204 +++++-
 dspl/src/fft_subkernel.c                      | 679 +++++++++++++++++-
 dspl/src/filter_ap.c                          |   6 -
 include/dspl.h                                |  48 +-
 .../bin/octave/fft_cmplx_performance.m        | 133 ++--
 performance/src/fft_cmplx_performance.c       |   8 +-
 .../bin/octave/fft_radix2_verification.m      |  44 ++
 verification/src/fft_radix2_verification.c    | 156 ++++
 14 files changed, 1369 insertions(+), 146 deletions(-)
 create mode 100644 verification/bin/octave/fft_radix2_verification.m
 create mode 100644 verification/src/fft_radix2_verification.c

diff --git a/_release/dspl.c b/_release/dspl.c
index dfbf7b6..253d23b 100644
--- a/_release/dspl.c
+++ b/_release/dspl.c
@@ -137,8 +137,10 @@ p_matrix_eig_cmplx                      matrix_eig_cmplx              ;
 p_matrix_eye                            matrix_eye                    ;
 p_matrix_eye_cmplx                      matrix_eye_cmplx              ;
 p_matrix_mul                            matrix_mul                    ;
+p_matrix_pinv                           matrix_pinv                   ;
 p_matrix_print                          matrix_print                  ;
 p_matrix_print_cmplx                    matrix_print_cmplx            ;
+p_matrix_svd                            matrix_svd                    ;
 p_matrix_transpose                      matrix_transpose              ;
 p_matrix_transpose_cmplx                matrix_transpose_cmplx        ;
 p_matrix_transpose_hermite              matrix_transpose_hermite      ;
@@ -349,8 +351,10 @@ void* dspl_load()
     LOAD_FUNC(matrix_eye);
     LOAD_FUNC(matrix_eye_cmplx);
     LOAD_FUNC(matrix_mul);
+    LOAD_FUNC(matrix_pinv);
     LOAD_FUNC(matrix_print);
     LOAD_FUNC(matrix_print_cmplx);
+    LOAD_FUNC(matrix_svd);
     LOAD_FUNC(matrix_transpose);
     LOAD_FUNC(matrix_transpose_cmplx);
     LOAD_FUNC(matrix_transpose_hermite);
@@ -440,8 +444,6 @@ void* dspl_load()
 
 
 
-
-
 void dspl_free(void* handle)
 {
     #ifdef WIN_OS
diff --git a/_release/dspl.h b/_release/dspl.h
index 09c4d97..45a0f0f 100644
--- a/_release/dspl.h
+++ b/_release/dspl.h
@@ -225,10 +225,19 @@ www.dsplib.org
 #endif
 typedef struct
 {
-   complex_t*  w;
-   complex_t*  t0;
-   complex_t*  t1;
-   int         n;
+    complex_t*  w;
+    complex_t*  t0;
+    complex_t*  t1;
+   
+    complex_t    w32[ 32];
+    complex_t    w64[ 64];
+    complex_t   w128[128];
+    complex_t   w256[256];
+    complex_t   w512[512];
+    complex_t*  w1024;
+    complex_t*  w2048;
+    complex_t*  w4096;
+    int         n;
 } fft_t;
 
 
@@ -1216,6 +1225,13 @@ DECLARE_FUNC(int,        matrix_mul,                  double*          a
                                                 COMMA int              mb
                                                 COMMA double*          c);
 /*----------------------------------------------------------------------------*/
+DECLARE_FUNC(int,        matrix_pinv,                 double*          a
+                                                COMMA int              n
+                                                COMMA int              m
+                                                COMMA double*          tol
+                                                COMMA double*          inv
+                                                COMMA int*             info);
+/*----------------------------------------------------------------------------*/
 DECLARE_FUNC(int,        matrix_print,                double*          a
                                                 COMMA int              n
                                                 COMMA int              m
@@ -1228,6 +1244,14 @@ DECLARE_FUNC(int,        matrix_print_cmplx,          complex_t*       a
                                                 COMMA const char*      name
                                                 COMMA const char*      format);
 /*----------------------------------------------------------------------------*/
+DECLARE_FUNC(int,        matrix_svd,                  double*          a
+                                                COMMA int              n
+                                                COMMA int              m
+                                                COMMA double*          u
+                                                COMMA double*          s
+                                                COMMA double*          vt
+                                                COMMA int*             info);
+/*----------------------------------------------------------------------------*/
 DECLARE_FUNC(int,        matrix_transpose,            double*          a
                                                 COMMA int              n
                                                 COMMA int              m
@@ -1563,49 +1587,6 @@ DECLARE_FUNC(int,        xcorr_cmplx,                 complex_t*        x
 #endif
 
 
-
-
-
-
-#ifdef DOXYGEN_ENGLISH
-/*! ****************************************************************************
-\ingroup SYS_LOADING_GROUP
-\fn void dspl_free(void* handle)
-\brief Cleans up the previously linked DSPL-2.0 dynamic library.
-
-This cross-platform function clears the library `libdspl.dll` in
-Windows system and from the library `libdspl.so` on the Linux system.
-After cleaning the library, all functions will become unavailable.
-
-\param [in] handle
-Handle of the previously linked DSPL-2.0 library. \n
-This pointer can be `NULL`, in this case no action
-are being produced.
-
-\author Bakhurin Sergey. www.dsplib.org
-***************************************************************************** */
-#endif
-#ifdef DOXYGEN_RUSSIAN
-/*! ****************************************************************************
-\ingroup SYS_LOADING_GROUP
-\fn void dspl_free(void* handle)
-\brief Очищает связанную ранее динамическую библиотеку DSPL-2.0.
-
-Данная кроссплатформенная функция производит очистку библиотеки `libdspl.dll` в 
-системе Windows и с библиотеки `libdspl.so` в системе Linux. 
-После очистки библиотеки все функции станут недоступны.
-
-\param[in] handle
-Хэндл прилинкованной ранее библиотеки DSPL-2.0. \n
-Данный указатель может быть `NULL`, в этом случае никакие действия не 
-производятся.\n\n
-
-\author Бахурин Сергей. www.dsplib.org
-**************************************************************************** */
-#endif
-void* dspl_load();
-
-
 #ifdef DOXYGEN_ENGLISH
 /*! ****************************************************************************
 \ingroup SYS_LOADING_GROUP
@@ -1694,7 +1675,7 @@ int main(int argc, char* argv[])
     void* hdspl;           // DSPL хэндл
     hdspl = dspl_load();   // Динамическая линковка
     
-    // Проверяем указатель. Если `NULLL`, то линковка прошла неудачно
+    // Проверяем указатель. Если `NULL`, то линковка прошла неудачно
     if(!hdspl)
     {
         printf("libdspl loading error!\n");
@@ -1714,6 +1695,48 @@ int main(int argc, char* argv[])
 \author Бахурин Сергей. www.dsplib.org
 ***************************************************************************** */
 #endif
+void* dspl_load();
+
+
+
+
+
+#ifdef DOXYGEN_ENGLISH
+/*! ****************************************************************************
+\ingroup SYS_LOADING_GROUP
+\fn void dspl_free(void* handle)
+\brief Cleans up the previously linked DSPL-2.0 dynamic library.
+
+This cross-platform function clears the library `libdspl.dll` in
+Windows system and from the library `libdspl.so` on the Linux system.
+After cleaning the library, all functions will become unavailable.
+
+\param [in] handle
+Handle of the previously linked DSPL-2.0 library. \n
+This pointer can be `NULL`, in this case no action
+are being produced.
+
+\author Bakhurin Sergey. www.dsplib.org
+***************************************************************************** */
+#endif
+#ifdef DOXYGEN_RUSSIAN
+/*! ****************************************************************************
+\ingroup SYS_LOADING_GROUP
+\fn void dspl_free(void* handle)
+\brief Очищает связанную ранее динамическую библиотеку DSPL-2.0.
+
+Данная кроссплатформенная функция производит очистку библиотеки `libdspl.dll` в 
+системе Windows и с библиотеки `libdspl.so` в системе Linux. 
+После очистки библиотеки все функции станут недоступны.
+
+\param[in] handle
+Хэндл прилинкованной ранее библиотеки DSPL-2.0. \n
+Данный указатель может быть `NULL`, в этом случае никакие действия не 
+производятся.\n\n
+
+\author Бахурин Сергей. www.dsplib.org
+**************************************************************************** */
+#endif
 void  dspl_free(void* handle);
 
 
diff --git a/dspl/dox/doxyfile_en b/dspl/dox/doxyfile_en
index 833bae0..ab367a9 100644
--- a/dspl/dox/doxyfile_en
+++ b/dspl/dox/doxyfile_en
@@ -1,4 +1,4 @@
-# Doxyfile 1.8.18
+# Doxyfile 1.8.20
 
 # This file describes the settings to be used by the documentation system
 # doxygen (www.doxygen.org) for a project.
@@ -227,6 +227,14 @@ QT_AUTOBRIEF           = NO
 
 MULTILINE_CPP_IS_BRIEF = NO
 
+# By default Python docstrings are displayed as preformatted text and doxygen's
+# special commands cannot be used. By setting PYTHON_DOCSTRING to NO the
+# doxygen's special commands can be used and the contents of the docstring
+# documentation blocks is shown as doxygen documentation.
+# The default value is: YES.
+
+PYTHON_DOCSTRING       = YES
+
 # If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the
 # documentation from any documented member that it re-implements.
 # The default value is: YES.
@@ -449,6 +457,19 @@ TYPEDEF_HIDES_STRUCT   = NO
 
 LOOKUP_CACHE_SIZE      = 0
 
+# The NUM_PROC_THREADS specifies the number threads doxygen is allowed to use
+# during processing. When set to 0 doxygen will based this on the number of
+# cores available in the system. You can set it explicitly to a value larger
+# than 0 to get more control over the balance between CPU load and processing
+# speed. At this moment only the input processing can be done using multiple
+# threads. Since this is still an experimental feature the default is set to 1,
+# which efficively disables parallel processing. Please report any issues you
+# encounter. Generating dot graphs in parallel is controlled by the
+# DOT_NUM_THREADS setting.
+# Minimum value: 0, maximum value: 32, default value: 1.
+
+NUM_PROC_THREADS       = 1
+
 #---------------------------------------------------------------------------
 # Build related configuration options
 #---------------------------------------------------------------------------
@@ -553,7 +574,7 @@ INTERNAL_DOCS          = NO
 # names in lower-case letters. If set to YES, upper-case letters are also
 # allowed. This is useful if you have classes or files whose names only differ
 # in case and if your file system supports case sensitive file names. Windows
-# (including Cygwin) ands Mac users are advised to set this option to NO.
+# (including Cygwin) and Mac users are advised to set this option to NO.
 # The default value is: system dependent.
 
 CASE_SENSE_NAMES       = NO
@@ -1132,10 +1153,13 @@ CLANG_ASSISTED_PARSING = NO
 CLANG_OPTIONS          =
 
 # If clang assisted parsing is enabled you can provide the clang parser with the
-# path to the compilation database (see:
-# http://clang.llvm.org/docs/HowToSetupToolingForLLVM.html) used when the files
-# were built. This is equivalent to specifying the "-p" option to a clang tool,
-# such as clang-check. These options will then be passed to the parser.
+# path to the directory containing a file called compile_commands.json. This
+# file is the compilation database (see:
+# http://clang.llvm.org/docs/HowToSetupToolingForLLVM.html) containing the
+# options used when the source files were built. This is equivalent to
+# specifying the "-p" option to a clang tool, such as clang-check. These options
+# will then be passed to the parser. Any options specified with CLANG_OPTIONS
+# will be added as well.
 # Note: The availability of this option depends on whether or not doxygen was
 # generated with the -Duse_libclang=ON option for CMake.
 
@@ -1405,7 +1429,7 @@ CHM_FILE               =
 HHC_LOCATION           =
 
 # The GENERATE_CHI flag controls if a separate .chi index file is generated
-# (YES) or that it should be included in the master .chm file (NO).
+# (YES) or that it should be included in the main .chm file (NO).
 # The default value is: NO.
 # This tag requires that the tag GENERATE_HTMLHELP is set to YES.
 
@@ -1571,8 +1595,8 @@ EXT_LINKS_IN_WINDOW    = NO
 # tool (see https://github.com/dawbarton/pdf2svg) or inkscape (see
 # https://inkscape.org) to generate formulas as SVG images instead of PNGs for
 # the HTML output. These images will generally look nicer at scaled resolutions.
-# Possible values are: png The default and svg Looks nicer but requires the
-# pdf2svg tool.
+# Possible values are: png (the default) and svg (looks nicer but requires the
+# pdf2svg or inkscape tool).
 # The default value is: png.
 # This tag requires that the tag GENERATE_HTML is set to YES.
 
@@ -1613,7 +1637,7 @@ FORMULA_MACROFILE      =
 # The default value is: NO.
 # This tag requires that the tag GENERATE_HTML is set to YES.
 
-USE_MATHJAX            = NO
+USE_MATHJAX            = YES
 
 # When MathJax is enabled you can set the default output format to be used for
 # the MathJax output. See the MathJax site (see:
@@ -1623,7 +1647,7 @@ USE_MATHJAX            = NO
 # The default value is: HTML-CSS.
 # This tag requires that the tag USE_MATHJAX is set to YES.
 
-MATHJAX_FORMAT         = SVG
+MATHJAX_FORMAT         = HTML-CSS
 
 # When MathJax is enabled you need to specify the location relative to the HTML
 # output directory using the MATHJAX_RELPATH option. The destination directory
@@ -1636,7 +1660,7 @@ MATHJAX_FORMAT         = SVG
 # The default value is: https://cdn.jsdelivr.net/npm/mathjax@2.
 # This tag requires that the tag USE_MATHJAX is set to YES.
 
-MATHJAX_RELPATH        = http://dsplib.org/mathjax/latest
+MATHJAX_RELPATH        = http://cdn.mathjax.org/mathjax/latest
 
 # The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax
 # extension names that should be enabled during MathJax rendering. For example
@@ -1872,9 +1896,11 @@ LATEX_EXTRA_FILES      =
 
 PDF_HYPERLINKS         = YES
 
-# If the USE_PDFLATEX tag is set to YES, doxygen will use pdflatex to generate
-# the PDF file directly from the LaTeX files. Set this option to YES, to get a
-# higher quality PDF documentation.
+# If the USE_PDFLATEX tag is set to YES, doxygen will use the engine as
+# specified with LATEX_CMD_NAME to generate the PDF file directly from the LaTeX
+# files. Set this option to YES, to get a higher quality PDF documentation.
+#
+# See also section LATEX_CMD_NAME for selecting the engine.
 # The default value is: YES.
 # This tag requires that the tag GENERATE_LATEX is set to YES.
 
@@ -2113,6 +2139,10 @@ DOCBOOK_PROGRAMLISTING = NO
 
 GENERATE_AUTOGEN_DEF   = NO
 
+#---------------------------------------------------------------------------
+# Configuration options related to Sqlite3 output
+#---------------------------------------------------------------------------
+
 #---------------------------------------------------------------------------
 # Configuration options related to the Perl module output
 #---------------------------------------------------------------------------
diff --git a/dspl/dox/footer_en.html b/dspl/dox/footer_en.html
index e9a0f17..f421ee1 100644
--- a/dspl/dox/footer_en.html
+++ b/dspl/dox/footer_en.html
@@ -27,7 +27,12 @@ $generatedby &#160;<a href="http://www.doxygen.org/index.html">
         <a href="http://en.dsplib.org/forum">Forum</a>
     </li>
   
-  
+    <div style = "float: right;">
+    <!--LiveInternet logo--><a href="//www.liveinternet.ru/click"
+    target="_blank"><img src="//counter.yadro.ru/logo?16.2"
+    title="LiveInternet: Shows the number of views in 24 hours, visitors in 24 hours and today"
+    alt="" style="border:0" width="88" height="31"/></a><!--/LiveInternet-->
+    </div>
   
   </ul>
   <HR>
diff --git a/dspl/dox/header_en.html b/dspl/dox/header_en.html
index e007c29..a5846fe 100644
--- a/dspl/dox/header_en.html
+++ b/dspl/dox/header_en.html
@@ -102,7 +102,7 @@ screen.colorDepth:screen.pixelDepth))+";u"+escape(document.URL)+
     <li><a href="http://en.dsplib.org/dspl">DSPL&ndash;2.0</a></li>
     <li><a href="http://en.dsplib.org/forum">Forum</a></li>
     <li    class = "lang-link">
-    <div   class = "lang-link-en">  <a href="http://ru.dsplib.org/dspl">Русский</a> </div>
+    <div   class = "lang-link-en">  <a href="https://ru.dsplib.org/dspl">Русский</a> </div>
     </li>
   </ul>
 </nav>
diff --git a/dspl/src/dspl_internal.h b/dspl/src/dspl_internal.h
index 3787d64..b8fd1e1 100644
--- a/dspl/src/dspl_internal.h
+++ b/dspl/src/dspl_internal.h
@@ -30,16 +30,22 @@
 /* sqrt(2^31) */
 #define FFT_COMPOSITE_MAX           46340
 
+
+/* FFT kernel */
 int fft_krn(complex_t* t0, complex_t* t1, fft_t* p, int n, int addr);
 
-
+/* DFT 2 points */
 void dft2 (complex_t *x,  complex_t* y);
 
+/* DFT 3 points */
 #define DFT3_W         0.866025403784439
 void dft3 (complex_t *x,  complex_t* y);
 
+/* DFT 4 points */
 void dft4 (complex_t *x,  complex_t* y);
 
+
+/* DFT 5 points */
 #define DFT5_W1       -1.250000000000000
 #define DFT5_W2        0.559016994374947
 #define DFT5_W3        1.538841768587630
@@ -47,6 +53,8 @@ void dft4 (complex_t *x,  complex_t* y);
 #define DFT5_W5        0.363271264002680
 void dft5 (complex_t *x,  complex_t* y);
 
+
+/* DFT 7 points */
 #define DFT7_W1      -1.166666666666666518636930
 #define DFT7_W2       0.790156468525400224045541
 #define DFT7_W3       0.055854267289647742400494
@@ -57,18 +65,49 @@ void dft5 (complex_t *x,  complex_t* y);
 #define DFT7_W8      -0.874842290961656665615465
 void dft7 (complex_t *x,  complex_t* y);
 
+
+/* DFT 8 points */
 #define DFT8_W         0.707106781186548
 void dft8 (complex_t *x,  complex_t* y);
 void transpose2x4(complex_t *x,  complex_t* y);
 void transpose4x2(complex_t *x,  complex_t* y);
 
-
+/* DFT 16 points */
 #define DFT16_W1       0.923879532511287
 #define DFT16_W2       0.382683432365090
 #define DFT16_W3       0.707106781186548
 void dft16 (complex_t *x,  complex_t* y);
 void transpose4x4(complex_t *x,  complex_t* y);
 
+/* DFT 32 points */
+void dft32(complex_t *x, complex_t* y, complex_t* w);
+void transpose8x4(complex_t *x, complex_t* y);
+void transpose4x8(complex_t *x, complex_t* y);
+
+/* DFT 64 points */
+void dft64(complex_t *x, complex_t* y, complex_t* w);
+void transpose8x8(complex_t *x, complex_t* y);
+
+/* DFT 128 points */
+void dft128(complex_t *x, complex_t* y, complex_t* w);
+
+/* DFT 256 points */
+void dft256(complex_t *x, complex_t* y, complex_t* w);
+void transpose16x16(complex_t* x, complex_t* y);
+
+/* DFT 512 points */
+void dft512(complex_t *x, complex_t* y, complex_t* w, complex_t* w32);
+
+/* DFT 1024 points */
+void dft1024(complex_t *x, complex_t* y, complex_t* w, complex_t* w32);
+
+
+/* DFT 2048 points */
+void dft2048(complex_t *x, complex_t* y, complex_t* w, 
+             complex_t* w32, complex_t* w64);
+
+/* DFT 4096 points */
+void dft4096(complex_t *x, complex_t* y, complex_t* w,  complex_t* w256);
 
 /* Window functions */
 int win_bartlett        (double *w, int n, int win_type);
diff --git a/dspl/src/fft.c b/dspl/src/fft.c
index 2ee6bd7..c3b1ec5 100644
--- a/dspl/src/fft.c
+++ b/dspl/src/fft.c
@@ -675,13 +675,21 @@ int fft_krn(complex_t* t0, complex_t* t1, fft_t* p, int n, int addr)
     complex_t tmp;
     
     n1 = 1;
-    if(n%16== 0) { n1 = 16; goto label_size; }
-    if(n%7 == 0) { n1 =  7; goto label_size; }
-    if(n%8 == 0) { n1 =  8; goto label_size; }
-    if(n%5 == 0) { n1 =  5; goto label_size; }
-    if(n%4 == 0) { n1 =  4; goto label_size; }
-    if(n%3 == 0) { n1 =  3; goto label_size; }
-    if(n%2 == 0) { n1 =  2; goto label_size; }
+    if(n % 4096 == 0) { n1 = 4096; goto label_size; }
+    if(n % 2048 == 0) { n1 = 2048; goto label_size; }
+    if(n % 1024 == 0) { n1 = 1024; goto label_size; }
+    if(n %  512 == 0) { n1 =  512; goto label_size; }
+    if(n %  256 == 0) { n1 =  256; goto label_size; }
+    if(n %  128 == 0) { n1 =  128; goto label_size; }
+    if(n %   64 == 0) { n1 =   64; goto label_size; }
+    if(n %   32 == 0) { n1 =   32; goto label_size; }
+    if(n %   16 == 0) { n1 =   16; goto label_size; }
+    if(n %    7 == 0) { n1 =    7; goto label_size; }
+    if(n %    8 == 0) { n1 =    8; goto label_size; }
+    if(n %    5 == 0) { n1 =    5; goto label_size; }
+    if(n %    4 == 0) { n1 =    4; goto label_size; }
+    if(n %    3 == 0) { n1 =    3; goto label_size; }
+    if(n %    2 == 0) { n1 =    2; goto label_size; }
 
 label_size:
     if(n1 == 1)
@@ -709,6 +717,38 @@ label_size:
             matrix_transpose_cmplx(t1, n2, n1, t0);
         }
         
+        if(n1 == 4096)
+            for(k = 0; k < n2; k++)
+                dft4096(t0+4096*k, t1+4096*k, p->w4096, p->w256);
+              
+        if(n1 == 2048)
+            for(k = 0; k < n2; k++)
+                dft2048(t0+2048*k, t1+2048*k, p->w2048, p->w32, p->w64);
+        
+        if(n1 == 1024)
+            for(k = 0; k < n2; k++)
+                dft1024(t0+1024*k, t1+1024*k, p->w1024, p->w32);
+              
+        if(n1 == 512)
+            for(k = 0; k < n2; k++)
+                dft512(t0+512*k, t1+512*k, p->w512, p->w32);
+              
+        if(n1 == 256)
+            for(k = 0; k < n2; k++)
+                dft256(t0+256*k, t1+256*k, p->w256);
+              
+        if(n1 == 128)
+            for(k = 0; k < n2; k++)
+                dft128(t0+128*k, t1+128*k, p->w128);
+              
+        if(n1 == 64)
+            for(k = 0; k < n2; k++)
+                dft64(t0+64*k, t1+64*k, p->w64);
+
+        if(n1 == 32)
+            for(k = 0; k < n2; k++)
+                dft32(t0+32*k, t1+32*k, p->w32);
+        
         if(n1 == 16)
             for(k = 0; k < n2; k++)
                 dft16(t0+16*k, t1+16*k);
@@ -752,6 +792,7 @@ label_size:
             {
                 fft_krn(t1+k*n2, t0+k*n2, p, n2, addr+n);
             }
+            
             matrix_transpose_cmplx(t0, n2, n1, t1);
         }
     }
@@ -903,13 +944,21 @@ int DSPL_API fft_create(fft_t* pfft, int n)
     while(s > 1)
     {
         n2 = 1;
-        if(s%16== 0)  { n2 = 16; goto label_size; }
-        if(s%7 == 0)  { n2 =  7; goto label_size; }
-        if(s%8 == 0)  { n2 =  8; goto label_size; }
-        if(s%5 == 0)  { n2 =  5; goto label_size; }
-        if(s%4 == 0)  { n2 =  4; goto label_size; }
-        if(s%3 == 0)  { n2 =  3; goto label_size; }
-        if(s%2 == 0)  { n2 =  2; goto label_size; }
+        if(s%4096 == 0)  { n2 = 4096; goto label_size; }
+        if(s%2048 == 0)  { n2 = 2048; goto label_size; }
+        if(s%1024 == 0)  { n2 = 1024; goto label_size; }
+        if(s%512  == 0)  { n2 =  512; goto label_size; }
+        if(s%256  == 0)  { n2 =  256; goto label_size; }
+        if(s%128  == 0)  { n2 =  128; goto label_size; }
+        if(s% 64  == 0)  { n2 =   64; goto label_size; }
+        if(s% 32  == 0)  { n2 =   32; goto label_size; }
+        if(s% 16  == 0)  { n2 =   16; goto label_size; }
+        if(s%  7  == 0)  { n2 =    7; goto label_size; }
+        if(s%  8  == 0)  { n2 =    8; goto label_size; }
+        if(s%  5  == 0)  { n2 =    5; goto label_size; }
+        if(s%  4  == 0)  { n2 =    4; goto label_size; }
+        if(s%  3  == 0)  { n2 =    3; goto label_size; }
+        if(s%  2  == 0)  { n2 =    2; goto label_size; }
 
 
 label_size:
@@ -962,6 +1011,123 @@ label_size:
     pfft->t1 = pfft->t1 ? (complex_t*) realloc(pfft->t1, n*sizeof(complex_t)):
                           (complex_t*) malloc(           n*sizeof(complex_t));
     pfft->n = n;
+    
+    /* w32 fill */
+    addr = 0;
+    for(k = 0; k < 4; k++)
+    {
+        for(m = 0; m < 8; m++)
+        {
+            phi = - M_2PI * (double)(k*m) / 32.0;
+            RE(pfft->w32[addr]) = cos(phi);
+            IM(pfft->w32[addr]) = sin(phi);
+            addr++;
+        }
+    }
+    
+    
+    /* w64 fill */
+    addr = 0;
+    for(k = 0; k < 8; k++)
+    {
+        for(m = 0; m < 8; m++)
+        {
+            phi = - M_2PI * (double)(k*m) / 64.0;
+            RE(pfft->w64[addr]) = cos(phi);
+            IM(pfft->w64[addr]) = sin(phi);
+            addr++;
+        }
+    }
+    
+    /* w128 fill */
+    addr = 0;
+    for(k = 0; k < 8; k++)
+    {
+        for(m = 0; m < 16; m++)
+        {
+            phi = - M_2PI * (double)(k*m) / 128.0;
+            RE(pfft->w128[addr]) = cos(phi);
+            IM(pfft->w128[addr]) = sin(phi);
+            addr++;
+        }
+    }
+    
+    /* w256 fill */
+    addr = 0;
+    for(k = 0; k < 16; k++)
+    {
+        for(m = 0; m < 16; m++)
+        {
+            phi = - M_2PI * (double)(k*m) / 256.0;
+            RE(pfft->w256[addr]) = cos(phi);
+            IM(pfft->w256[addr]) = sin(phi);
+            addr++;
+        }
+    }
+    
+    /* w512 fill */
+    addr = 0;
+    for(k = 0; k < 16; k++)
+    {
+        for(m = 0; m < 32; m++)
+        {
+            phi = - M_2PI * (double)(k*m) / 512.0;
+            RE(pfft->w512[addr]) = cos(phi);
+            IM(pfft->w512[addr]) = sin(phi);
+            addr++;
+        }
+    }
+    
+    /* w1024 fill */
+    if(pfft->w1024 == NULL)
+    {
+        pfft->w1024 = (complex_t*) malloc(1024 * sizeof(complex_t));
+        addr = 0;
+        for(k = 0; k < 32; k++)
+        {
+            for(m = 0; m < 32; m++)
+            {
+                phi = - M_2PI * (double)(k*m) / 1024.0;
+                RE(pfft->w1024[addr]) = cos(phi);
+                IM(pfft->w1024[addr]) = sin(phi);
+                addr++;
+            }
+        }
+    }
+    
+    /* w2048 fill */
+    if(pfft->w2048 == NULL)
+    {
+        pfft->w2048= (complex_t*) malloc(2048 * sizeof(complex_t));
+        addr = 0;
+        for(k = 0; k < 32; k++)
+        {
+            for(m = 0; m < 64; m++)
+            {
+                phi = - M_2PI * (double)(k*m) / 2048.0;
+                RE(pfft->w2048[addr]) = cos(phi);
+                IM(pfft->w2048[addr]) = sin(phi);
+                addr++;
+            }
+        }
+    }
+    
+    /* w4096 fill */
+    if(pfft->w4096 == NULL)
+    {
+        pfft->w4096= (complex_t*) malloc(4096 * sizeof(complex_t));
+        addr = 0;
+        for(k = 0; k < 16; k++)
+        {
+            for(m = 0; m < 256; m++)
+            {
+                phi = - M_2PI * (double)(k*m) / 4096.0;
+                RE(pfft->w4096[addr]) = cos(phi);
+                IM(pfft->w4096[addr]) = sin(phi);
+                addr++;
+            }
+        }
+    }
 
     return RES_OK;
 error_proc:
@@ -1016,6 +1182,16 @@ void DSPL_API fft_free(fft_t *pfft)
         free(pfft->t0);
     if(pfft->t1)
         free(pfft->t1);
+      
+    if(pfft->w1024)
+        free(pfft->w1024);
+      
+    if(pfft->w2048)
+        free(pfft->w2048);
+      
+    if(pfft->w4096)
+        free(pfft->w4096);
+      
     memset(pfft, 0, sizeof(fft_t));
 }
 
diff --git a/dspl/src/fft_subkernel.c b/dspl/src/fft_subkernel.c
index 3d59f62..c51cc64 100644
--- a/dspl/src/fft_subkernel.c
+++ b/dspl/src/fft_subkernel.c
@@ -26,10 +26,6 @@
 
 
 
-
-
-
-
 /*******************************************************************************
 2 points DFT
 *******************************************************************************/
@@ -319,7 +315,6 @@ void dft7 (complex_t *x, complex_t* y)
 
     RE(y[6]) = RE(sum[20]) - RE(sum[26]);
     IM(y[6]) = IM(sum[20]) - IM(sum[26]);
-
 }
 
 
@@ -455,6 +450,263 @@ void dft16(complex_t *x, complex_t* y)
 }
 
 
+/*******************************************************************************
+32 points DFT (Winograd algorithm)
+*******************************************************************************/
+void dft32(complex_t *x, complex_t* y, complex_t* w)
+{
+    complex_t t0[32];
+    complex_t t1[32];
+
+    int i;
+
+    transpose4x8(x, t0);
+    dft8(t0,    t1);
+    dft8(t0+8,  t1+8);
+    dft8(t0+16, t1+16);
+    dft8(t0+24, t1+24);
+    
+
+    for(i = 0; i < 32; i++)
+    {
+        RE(t0[i]) = CMRE(t1[i], w[i]); 
+        IM(t0[i]) = CMIM(t1[i], w[i]); 
+    }
+
+    transpose8x4(t0, t1);
+
+    for(i = 0; i < 8; i++)
+        dft4(t1 + i*4,    t0 + i*4);
+
+    transpose4x8(t0, y);
+}
+
+
+/*******************************************************************************
+64 points DFT (Winograd algorithm)
+*******************************************************************************/
+void dft64(complex_t *x, complex_t* y, complex_t* w)
+{
+    complex_t t0[64];
+    complex_t t1[64];
+
+    int i;
+
+    transpose8x8(x, t0);
+
+    for(i = 0; i < 8; i++)
+        dft8(t0 + i*8,    t1 + i*8);
+
+    for(i = 0; i < 64; i++)
+    {
+        RE(t0[i]) = CMRE(t1[i], w[i]); 
+        IM(t0[i]) = CMIM(t1[i], w[i]); 
+    }
+    
+    transpose8x8(t0, t1);
+    
+    for(i = 0; i < 8; i++)
+        dft8(t1 + i*8,    t0 + i*8);
+    transpose8x8(t0, y);
+}
+
+
+
+/*******************************************************************************
+256 points DFT (Winograd algorithm)
+*******************************************************************************/
+void dft128(complex_t *x, complex_t* y, complex_t* w)
+{
+    complex_t t0[128];
+    complex_t t1[128];
+
+    int i;
+
+    matrix_transpose_cmplx(x,8,16,t0);
+
+    for(i = 0; i < 8; i++)
+        dft16(t0 + i*16,    t1 + i*16);
+
+    for(i = 0; i < 128; i++)
+    {
+        RE(t0[i]) = CMRE(t1[i], w[i]); 
+        IM(t0[i]) = CMIM(t1[i], w[i]); 
+    }
+
+    matrix_transpose_cmplx(t0, 16, 8, t1);
+
+    for(i = 0; i < 16; i++)
+        dft8(t1 + i*8,    t0 + i*8);
+
+    matrix_transpose_cmplx(t0, 8, 16, y);
+}
+
+
+
+/*******************************************************************************
+256 points DFT (Winograd algorithm)
+*******************************************************************************/
+void dft256(complex_t *x, complex_t* y, complex_t* w)
+{
+    complex_t t0[256];
+    complex_t t1[256];
+
+    int i;
+
+    transpose16x16(x, t0);
+
+    for(i = 0; i < 16; i++)
+        dft16(t0 + i*16,    t1 + i*16);
+
+    for(i = 0; i < 256; i++)
+    {
+        RE(t0[i]) = CMRE(t1[i], w[i]); 
+        IM(t0[i]) = CMIM(t1[i], w[i]); 
+    }
+
+    transpose16x16(t0, t1);
+
+    for(i = 0; i < 16; i++)
+        dft16(t1 + i*16,    t0 + i*16);
+
+    transpose16x16(t0, y);
+}
+
+
+/*******************************************************************************
+512 points DFT (Winograd algorithm)
+*******************************************************************************/
+void dft512(complex_t *x, complex_t* y, complex_t* w, complex_t* w32)
+{
+    complex_t t0[512];
+    complex_t t1[512];
+
+    int i;
+
+    matrix_transpose_cmplx(x,16,32,t0);
+
+    for(i = 0; i < 16; i++)
+        dft32(t0 + i*32,    t1 + i*32, w32);
+
+    for(i = 0; i < 512; i++)
+    {
+        RE(t0[i]) = CMRE(t1[i], w[i]); 
+        IM(t0[i]) = CMIM(t1[i], w[i]); 
+    }
+
+    matrix_transpose_cmplx(t0, 32, 16, t1);
+
+    for(i = 0; i < 32; i++)
+        dft16(t1 + i*16,    t0 + i*16);
+
+    matrix_transpose_cmplx(t0, 16, 32, y);
+}
+
+
+/*******************************************************************************
+1024 points DFT (Winograd algorithm)
+*******************************************************************************/
+void dft1024(complex_t *x, complex_t* y, complex_t* w, complex_t* w32)
+{
+    complex_t t0[1024];
+    complex_t t1[1024];
+
+    int i;
+
+    matrix_transpose_cmplx(x,32,32,t0);
+
+    for(i = 0; i < 32; i++)
+        dft32(t0 + i*32,    t1 + i*32, w32);
+
+    for(i = 0; i < 1024; i++)
+    {
+        RE(t0[i]) = CMRE(t1[i], w[i]); 
+        IM(t0[i]) = CMIM(t1[i], w[i]); 
+    }
+
+    matrix_transpose_cmplx(t0, 32, 32, t1);
+
+    for(i = 0; i < 32; i++)
+        dft32(t1 + i*32,    t0 + i*32, w32);
+
+    matrix_transpose_cmplx(t0, 32, 32, y);
+}
+
+/*******************************************************************************
+2048 points DFT (Winograd algorithm)
+*******************************************************************************/
+void dft2048(complex_t *x, complex_t* y, complex_t* w, 
+             complex_t* w32, complex_t* w64)
+{
+    complex_t *t0 = NULL;
+    complex_t *t1 = NULL;
+
+    int i;
+    
+    t0 = (complex_t*)malloc(2048*sizeof(complex_t));
+    t1 = (complex_t*)malloc(2048*sizeof(complex_t));
+    
+    matrix_transpose_cmplx(x,32,64,t0);
+
+    for(i = 0; i < 32; i++)
+        dft64(t0 + i*64,    t1 + i*64, w64);
+
+    for(i = 0; i < 2048; i++)
+    {
+        RE(t0[i]) = CMRE(t1[i], w[i]); 
+        IM(t0[i]) = CMIM(t1[i], w[i]); 
+    }
+
+    matrix_transpose_cmplx(t0, 64, 32, t1);
+
+    for(i = 0; i < 64; i++)
+        dft32(t1 + i*32,    t0 + i*32, w32);
+
+    matrix_transpose_cmplx(t0, 32, 64, y);
+    
+    free(t0);
+    free(t1);
+}
+
+
+
+/*******************************************************************************
+4096 points DFT (Winograd algorithm)
+*******************************************************************************/
+void dft4096(complex_t *x, complex_t* y, complex_t* w,  complex_t* w256)
+{
+    complex_t *t0 = NULL;
+    complex_t *t1 = NULL;
+
+    int i;
+    
+    t0 = (complex_t*)malloc(4096*sizeof(complex_t));
+    t1 = (complex_t*)malloc(4096*sizeof(complex_t));
+    
+    matrix_transpose_cmplx(x,16,256,t0);
+
+    for(i = 0; i < 16; i++)
+        dft256(t0 + i*256,    t1 + i*256, w256);
+
+    for(i = 0; i < 4096; i++)
+    {
+        RE(t0[i]) = CMRE(t1[i], w[i]); 
+        IM(t0[i]) = CMIM(t1[i], w[i]); 
+    }
+
+    matrix_transpose_cmplx(t0, 256, 16, t1);
+
+    for(i = 0; i < 256; i++)
+        dft16(t1 + i*16,    t0 + i*16);
+
+    matrix_transpose_cmplx(t0, 16, 256, y);
+    
+    free(t0);
+    free(t1);
+}
+
+
+
 /*******************************************************************************
 4 x 2 matrix transpose
 *******************************************************************************/
@@ -512,3 +764,420 @@ void transpose4x4(complex_t *x, complex_t* y)
 }
 
 
+/*******************************************************************************
+8 x 4 matrix transpose
+*******************************************************************************/
+void transpose8x4(complex_t *x, complex_t* y)
+{
+    RE(y[ 0]) = RE(x[  0]);    IM(y[ 0]) = IM(x[  0]);
+    RE(y[ 1]) = RE(x[  8]);    IM(y[ 1]) = IM(x[  8]);
+    RE(y[ 2]) = RE(x[ 16]);    IM(y[ 2]) = IM(x[ 16]);
+    RE(y[ 3]) = RE(x[ 24]);    IM(y[ 3]) = IM(x[ 24]);
+    RE(y[ 4]) = RE(x[  1]);    IM(y[ 4]) = IM(x[  1]);
+    RE(y[ 5]) = RE(x[  9]);    IM(y[ 5]) = IM(x[  9]);
+    RE(y[ 6]) = RE(x[ 17]);    IM(y[ 6]) = IM(x[ 17]);
+    RE(y[ 7]) = RE(x[ 25]);    IM(y[ 7]) = IM(x[ 25]);
+    RE(y[ 8]) = RE(x[  2]);    IM(y[ 8]) = IM(x[  2]);
+    RE(y[ 9]) = RE(x[ 10]);    IM(y[ 9]) = IM(x[ 10]);
+    RE(y[10]) = RE(x[ 18]);    IM(y[10]) = IM(x[ 18]);
+    RE(y[11]) = RE(x[ 26]);    IM(y[11]) = IM(x[ 26]);
+    RE(y[12]) = RE(x[  3]);    IM(y[12]) = IM(x[  3]);
+    RE(y[13]) = RE(x[ 11]);    IM(y[13]) = IM(x[ 11]);
+    RE(y[14]) = RE(x[ 19]);    IM(y[14]) = IM(x[ 19]);
+    RE(y[15]) = RE(x[ 27]);    IM(y[15]) = IM(x[ 27]);
+    RE(y[16]) = RE(x[  4]);    IM(y[16]) = IM(x[  4]);
+    RE(y[17]) = RE(x[ 12]);    IM(y[17]) = IM(x[ 12]);
+    RE(y[18]) = RE(x[ 20]);    IM(y[18]) = IM(x[ 20]);
+    RE(y[19]) = RE(x[ 28]);    IM(y[19]) = IM(x[ 28]);
+    RE(y[20]) = RE(x[  5]);    IM(y[20]) = IM(x[  5]);
+    RE(y[21]) = RE(x[ 13]);    IM(y[21]) = IM(x[ 13]);
+    RE(y[22]) = RE(x[ 21]);    IM(y[22]) = IM(x[ 21]);
+    RE(y[23]) = RE(x[ 29]);    IM(y[23]) = IM(x[ 29]);
+    RE(y[24]) = RE(x[  6]);    IM(y[24]) = IM(x[  6]);
+    RE(y[25]) = RE(x[ 14]);    IM(y[25]) = IM(x[ 14]);
+    RE(y[26]) = RE(x[ 22]);    IM(y[26]) = IM(x[ 22]);
+    RE(y[27]) = RE(x[ 30]);    IM(y[27]) = IM(x[ 30]);
+    RE(y[28]) = RE(x[  7]);    IM(y[28]) = IM(x[  7]);
+    RE(y[29]) = RE(x[ 15]);    IM(y[29]) = IM(x[ 15]);
+    RE(y[30]) = RE(x[ 23]);    IM(y[30]) = IM(x[ 23]);
+    RE(y[31]) = RE(x[ 31]);    IM(y[31]) = IM(x[ 31]);
+}
+
+
+/*******************************************************************************
+4 x 8 matrix transpose
+*******************************************************************************/
+void transpose4x8(complex_t *x, complex_t* y)
+{
+    RE(y[ 0]) = RE(x[  0]);    IM(y[ 0]) = IM(x[  0]);
+    RE(y[ 1]) = RE(x[  4]);    IM(y[ 1]) = IM(x[  4]);
+    RE(y[ 2]) = RE(x[  8]);    IM(y[ 2]) = IM(x[  8]);
+    RE(y[ 3]) = RE(x[ 12]);    IM(y[ 3]) = IM(x[ 12]);
+    RE(y[ 4]) = RE(x[ 16]);    IM(y[ 4]) = IM(x[ 16]);
+    RE(y[ 5]) = RE(x[ 20]);    IM(y[ 5]) = IM(x[ 20]);
+    RE(y[ 6]) = RE(x[ 24]);    IM(y[ 6]) = IM(x[ 24]);
+    RE(y[ 7]) = RE(x[ 28]);    IM(y[ 7]) = IM(x[ 28]);
+    RE(y[ 8]) = RE(x[  1]);    IM(y[ 8]) = IM(x[  1]);
+    RE(y[ 9]) = RE(x[  5]);    IM(y[ 9]) = IM(x[  5]);
+    RE(y[10]) = RE(x[  9]);    IM(y[10]) = IM(x[  9]);
+    RE(y[11]) = RE(x[ 13]);    IM(y[11]) = IM(x[ 13]);
+    RE(y[12]) = RE(x[ 17]);    IM(y[12]) = IM(x[ 17]);
+    RE(y[13]) = RE(x[ 21]);    IM(y[13]) = IM(x[ 21]);
+    RE(y[14]) = RE(x[ 25]);    IM(y[14]) = IM(x[ 25]);
+    RE(y[15]) = RE(x[ 29]);    IM(y[15]) = IM(x[ 29]);
+    RE(y[16]) = RE(x[  2]);    IM(y[16]) = IM(x[  2]);
+    RE(y[17]) = RE(x[  6]);    IM(y[17]) = IM(x[  6]);
+    RE(y[18]) = RE(x[ 10]);    IM(y[18]) = IM(x[ 10]);
+    RE(y[19]) = RE(x[ 14]);    IM(y[19]) = IM(x[ 14]);
+    RE(y[20]) = RE(x[ 18]);    IM(y[20]) = IM(x[ 18]);
+    RE(y[21]) = RE(x[ 22]);    IM(y[21]) = IM(x[ 22]);
+    RE(y[22]) = RE(x[ 26]);    IM(y[22]) = IM(x[ 26]);
+    RE(y[23]) = RE(x[ 30]);    IM(y[23]) = IM(x[ 30]);
+    RE(y[24]) = RE(x[  3]);    IM(y[24]) = IM(x[  3]);
+    RE(y[25]) = RE(x[  7]);    IM(y[25]) = IM(x[  7]);
+    RE(y[26]) = RE(x[ 11]);    IM(y[26]) = IM(x[ 11]);
+    RE(y[27]) = RE(x[ 15]);    IM(y[27]) = IM(x[ 15]);
+    RE(y[28]) = RE(x[ 19]);    IM(y[28]) = IM(x[ 19]);
+    RE(y[29]) = RE(x[ 23]);    IM(y[29]) = IM(x[ 23]);
+    RE(y[30]) = RE(x[ 27]);    IM(y[30]) = IM(x[ 27]);
+    RE(y[31]) = RE(x[ 31]);    IM(y[31]) = IM(x[ 31]);
+}
+
+
+/*******************************************************************************
+4 x 8 matrix transpose
+*******************************************************************************/
+void transpose8x8(complex_t *x, complex_t* y)
+{
+    RE(y[  0]) = RE(x[ 0]);    IM(y[  0]) = IM(x[ 0]);
+    RE(y[  1]) = RE(x[ 8]);    IM(y[  1]) = IM(x[ 8]);
+    RE(y[  2]) = RE(x[16]);    IM(y[  2]) = IM(x[16]);
+    RE(y[  3]) = RE(x[24]);    IM(y[  3]) = IM(x[24]);
+    RE(y[  4]) = RE(x[32]);    IM(y[  4]) = IM(x[32]);
+    RE(y[  5]) = RE(x[40]);    IM(y[  5]) = IM(x[40]);
+    RE(y[  6]) = RE(x[48]);    IM(y[  6]) = IM(x[48]);
+    RE(y[  7]) = RE(x[56]);    IM(y[  7]) = IM(x[56]);
+    RE(y[  8]) = RE(x[ 1]);    IM(y[  8]) = IM(x[ 1]);
+    RE(y[  9]) = RE(x[ 9]);    IM(y[  9]) = IM(x[ 9]);
+    RE(y[ 10]) = RE(x[17]);    IM(y[ 10]) = IM(x[17]);
+    RE(y[ 11]) = RE(x[25]);    IM(y[ 11]) = IM(x[25]);
+    RE(y[ 12]) = RE(x[33]);    IM(y[ 12]) = IM(x[33]);
+    RE(y[ 13]) = RE(x[41]);    IM(y[ 13]) = IM(x[41]);
+    RE(y[ 14]) = RE(x[49]);    IM(y[ 14]) = IM(x[49]);
+    RE(y[ 15]) = RE(x[57]);    IM(y[ 15]) = IM(x[57]);
+    RE(y[ 16]) = RE(x[ 2]);    IM(y[ 16]) = IM(x[ 2]);
+    RE(y[ 17]) = RE(x[10]);    IM(y[ 17]) = IM(x[10]);
+    RE(y[ 18]) = RE(x[18]);    IM(y[ 18]) = IM(x[18]);
+    RE(y[ 19]) = RE(x[26]);    IM(y[ 19]) = IM(x[26]);
+    RE(y[ 20]) = RE(x[34]);    IM(y[ 20]) = IM(x[34]);
+    RE(y[ 21]) = RE(x[42]);    IM(y[ 21]) = IM(x[42]);
+    RE(y[ 22]) = RE(x[50]);    IM(y[ 22]) = IM(x[50]);
+    RE(y[ 23]) = RE(x[58]);    IM(y[ 23]) = IM(x[58]);
+    RE(y[ 24]) = RE(x[ 3]);    IM(y[ 24]) = IM(x[ 3]);
+    RE(y[ 25]) = RE(x[11]);    IM(y[ 25]) = IM(x[11]);
+    RE(y[ 26]) = RE(x[19]);    IM(y[ 26]) = IM(x[19]);
+    RE(y[ 27]) = RE(x[27]);    IM(y[ 27]) = IM(x[27]);
+    RE(y[ 28]) = RE(x[35]);    IM(y[ 28]) = IM(x[35]);
+    RE(y[ 29]) = RE(x[43]);    IM(y[ 29]) = IM(x[43]);
+    RE(y[ 30]) = RE(x[51]);    IM(y[ 30]) = IM(x[51]);
+    RE(y[ 31]) = RE(x[59]);    IM(y[ 31]) = IM(x[59]);
+    RE(y[ 32]) = RE(x[ 4]);    IM(y[ 32]) = IM(x[ 4]);
+    RE(y[ 33]) = RE(x[12]);    IM(y[ 33]) = IM(x[12]);
+    RE(y[ 34]) = RE(x[20]);    IM(y[ 34]) = IM(x[20]);
+    RE(y[ 35]) = RE(x[28]);    IM(y[ 35]) = IM(x[28]);
+    RE(y[ 36]) = RE(x[36]);    IM(y[ 36]) = IM(x[36]);
+    RE(y[ 37]) = RE(x[44]);    IM(y[ 37]) = IM(x[44]);
+    RE(y[ 38]) = RE(x[52]);    IM(y[ 38]) = IM(x[52]);
+    RE(y[ 39]) = RE(x[60]);    IM(y[ 39]) = IM(x[60]);
+    RE(y[ 40]) = RE(x[ 5]);    IM(y[ 40]) = IM(x[ 5]);
+    RE(y[ 41]) = RE(x[13]);    IM(y[ 41]) = IM(x[13]);
+    RE(y[ 42]) = RE(x[21]);    IM(y[ 42]) = IM(x[21]);
+    RE(y[ 43]) = RE(x[29]);    IM(y[ 43]) = IM(x[29]);
+    RE(y[ 44]) = RE(x[37]);    IM(y[ 44]) = IM(x[37]);
+    RE(y[ 45]) = RE(x[45]);    IM(y[ 45]) = IM(x[45]);
+    RE(y[ 46]) = RE(x[53]);    IM(y[ 46]) = IM(x[53]);
+    RE(y[ 47]) = RE(x[61]);    IM(y[ 47]) = IM(x[61]);
+    RE(y[ 48]) = RE(x[ 6]);    IM(y[ 48]) = IM(x[ 6]);
+    RE(y[ 49]) = RE(x[14]);    IM(y[ 49]) = IM(x[14]);
+    RE(y[ 50]) = RE(x[22]);    IM(y[ 50]) = IM(x[22]);
+    RE(y[ 51]) = RE(x[30]);    IM(y[ 51]) = IM(x[30]);
+    RE(y[ 52]) = RE(x[38]);    IM(y[ 52]) = IM(x[38]);
+    RE(y[ 53]) = RE(x[46]);    IM(y[ 53]) = IM(x[46]);
+    RE(y[ 54]) = RE(x[54]);    IM(y[ 54]) = IM(x[54]);
+    RE(y[ 55]) = RE(x[62]);    IM(y[ 55]) = IM(x[62]);
+    RE(y[ 56]) = RE(x[ 7]);    IM(y[ 56]) = IM(x[ 7]);
+    RE(y[ 57]) = RE(x[15]);    IM(y[ 57]) = IM(x[15]);
+    RE(y[ 58]) = RE(x[23]);    IM(y[ 58]) = IM(x[23]);
+    RE(y[ 59]) = RE(x[31]);    IM(y[ 59]) = IM(x[31]);
+    RE(y[ 60]) = RE(x[39]);    IM(y[ 60]) = IM(x[39]);
+    RE(y[ 61]) = RE(x[47]);    IM(y[ 61]) = IM(x[47]);
+    RE(y[ 62]) = RE(x[55]);    IM(y[ 62]) = IM(x[55]);
+    RE(y[ 63]) = RE(x[63]);    IM(y[ 63]) = IM(x[63]);
+}
+
+
+
+/*******************************************************************************
+16 x 16 matrix transpose
+*******************************************************************************/
+void transpose16x16(complex_t* x, complex_t* y)
+{
+    RE(y[  0]) = RE(x[  0]);    IM(y[  0]) = IM(x[  0]);
+    RE(y[  1]) = RE(x[ 16]);    IM(y[  1]) = IM(x[ 16]);
+    RE(y[  2]) = RE(x[ 32]);    IM(y[  2]) = IM(x[ 32]);
+    RE(y[  3]) = RE(x[ 48]);    IM(y[  3]) = IM(x[ 48]);
+    RE(y[  4]) = RE(x[ 64]);    IM(y[  4]) = IM(x[ 64]);
+    RE(y[  5]) = RE(x[ 80]);    IM(y[  5]) = IM(x[ 80]);
+    RE(y[  6]) = RE(x[ 96]);    IM(y[  6]) = IM(x[ 96]);
+    RE(y[  7]) = RE(x[112]);    IM(y[  7]) = IM(x[112]);
+    RE(y[  8]) = RE(x[128]);    IM(y[  8]) = IM(x[128]);
+    RE(y[  9]) = RE(x[144]);    IM(y[  9]) = IM(x[144]);
+    RE(y[ 10]) = RE(x[160]);    IM(y[ 10]) = IM(x[160]);
+    RE(y[ 11]) = RE(x[176]);    IM(y[ 11]) = IM(x[176]);
+    RE(y[ 12]) = RE(x[192]);    IM(y[ 12]) = IM(x[192]);
+    RE(y[ 13]) = RE(x[208]);    IM(y[ 13]) = IM(x[208]);
+    RE(y[ 14]) = RE(x[224]);    IM(y[ 14]) = IM(x[224]);
+    RE(y[ 15]) = RE(x[240]);    IM(y[ 15]) = IM(x[240]);
+    RE(y[ 16]) = RE(x[  1]);    IM(y[ 16]) = IM(x[  1]);
+    RE(y[ 17]) = RE(x[ 17]);    IM(y[ 17]) = IM(x[ 17]);
+    RE(y[ 18]) = RE(x[ 33]);    IM(y[ 18]) = IM(x[ 33]);
+    RE(y[ 19]) = RE(x[ 49]);    IM(y[ 19]) = IM(x[ 49]);
+    RE(y[ 20]) = RE(x[ 65]);    IM(y[ 20]) = IM(x[ 65]);
+    RE(y[ 21]) = RE(x[ 81]);    IM(y[ 21]) = IM(x[ 81]);
+    RE(y[ 22]) = RE(x[ 97]);    IM(y[ 22]) = IM(x[ 97]);
+    RE(y[ 23]) = RE(x[113]);    IM(y[ 23]) = IM(x[113]);
+    RE(y[ 24]) = RE(x[129]);    IM(y[ 24]) = IM(x[129]);
+    RE(y[ 25]) = RE(x[145]);    IM(y[ 25]) = IM(x[145]);
+    RE(y[ 26]) = RE(x[161]);    IM(y[ 26]) = IM(x[161]);
+    RE(y[ 27]) = RE(x[177]);    IM(y[ 27]) = IM(x[177]);
+    RE(y[ 28]) = RE(x[193]);    IM(y[ 28]) = IM(x[193]);
+    RE(y[ 29]) = RE(x[209]);    IM(y[ 29]) = IM(x[209]);
+    RE(y[ 30]) = RE(x[225]);    IM(y[ 30]) = IM(x[225]);
+    RE(y[ 31]) = RE(x[241]);    IM(y[ 31]) = IM(x[241]);
+    RE(y[ 32]) = RE(x[  2]);    IM(y[ 32]) = IM(x[  2]);
+    RE(y[ 33]) = RE(x[ 18]);    IM(y[ 33]) = IM(x[ 18]);
+    RE(y[ 34]) = RE(x[ 34]);    IM(y[ 34]) = IM(x[ 34]);
+    RE(y[ 35]) = RE(x[ 50]);    IM(y[ 35]) = IM(x[ 50]);
+    RE(y[ 36]) = RE(x[ 66]);    IM(y[ 36]) = IM(x[ 66]);
+    RE(y[ 37]) = RE(x[ 82]);    IM(y[ 37]) = IM(x[ 82]);
+    RE(y[ 38]) = RE(x[ 98]);    IM(y[ 38]) = IM(x[ 98]);
+    RE(y[ 39]) = RE(x[114]);    IM(y[ 39]) = IM(x[114]);
+    RE(y[ 40]) = RE(x[130]);    IM(y[ 40]) = IM(x[130]);
+    RE(y[ 41]) = RE(x[146]);    IM(y[ 41]) = IM(x[146]);
+    RE(y[ 42]) = RE(x[162]);    IM(y[ 42]) = IM(x[162]);
+    RE(y[ 43]) = RE(x[178]);    IM(y[ 43]) = IM(x[178]);
+    RE(y[ 44]) = RE(x[194]);    IM(y[ 44]) = IM(x[194]);
+    RE(y[ 45]) = RE(x[210]);    IM(y[ 45]) = IM(x[210]);
+    RE(y[ 46]) = RE(x[226]);    IM(y[ 46]) = IM(x[226]);
+    RE(y[ 47]) = RE(x[242]);    IM(y[ 47]) = IM(x[242]);
+    RE(y[ 48]) = RE(x[  3]);    IM(y[ 48]) = IM(x[  3]);
+    RE(y[ 49]) = RE(x[ 19]);    IM(y[ 49]) = IM(x[ 19]);
+    RE(y[ 50]) = RE(x[ 35]);    IM(y[ 50]) = IM(x[ 35]);
+    RE(y[ 51]) = RE(x[ 51]);    IM(y[ 51]) = IM(x[ 51]);
+    RE(y[ 52]) = RE(x[ 67]);    IM(y[ 52]) = IM(x[ 67]);
+    RE(y[ 53]) = RE(x[ 83]);    IM(y[ 53]) = IM(x[ 83]);
+    RE(y[ 54]) = RE(x[ 99]);    IM(y[ 54]) = IM(x[ 99]);
+    RE(y[ 55]) = RE(x[115]);    IM(y[ 55]) = IM(x[115]);
+    RE(y[ 56]) = RE(x[131]);    IM(y[ 56]) = IM(x[131]);
+    RE(y[ 57]) = RE(x[147]);    IM(y[ 57]) = IM(x[147]);
+    RE(y[ 58]) = RE(x[163]);    IM(y[ 58]) = IM(x[163]);
+    RE(y[ 59]) = RE(x[179]);    IM(y[ 59]) = IM(x[179]);
+    RE(y[ 60]) = RE(x[195]);    IM(y[ 60]) = IM(x[195]);
+    RE(y[ 61]) = RE(x[211]);    IM(y[ 61]) = IM(x[211]);
+    RE(y[ 62]) = RE(x[227]);    IM(y[ 62]) = IM(x[227]);
+    RE(y[ 63]) = RE(x[243]);    IM(y[ 63]) = IM(x[243]);
+    RE(y[ 64]) = RE(x[  4]);    IM(y[ 64]) = IM(x[  4]);
+    RE(y[ 65]) = RE(x[ 20]);    IM(y[ 65]) = IM(x[ 20]);
+    RE(y[ 66]) = RE(x[ 36]);    IM(y[ 66]) = IM(x[ 36]);
+    RE(y[ 67]) = RE(x[ 52]);    IM(y[ 67]) = IM(x[ 52]);
+    RE(y[ 68]) = RE(x[ 68]);    IM(y[ 68]) = IM(x[ 68]);
+    RE(y[ 69]) = RE(x[ 84]);    IM(y[ 69]) = IM(x[ 84]);
+    RE(y[ 70]) = RE(x[100]);    IM(y[ 70]) = IM(x[100]);
+    RE(y[ 71]) = RE(x[116]);    IM(y[ 71]) = IM(x[116]);
+    RE(y[ 72]) = RE(x[132]);    IM(y[ 72]) = IM(x[132]);
+    RE(y[ 73]) = RE(x[148]);    IM(y[ 73]) = IM(x[148]);
+    RE(y[ 74]) = RE(x[164]);    IM(y[ 74]) = IM(x[164]);
+    RE(y[ 75]) = RE(x[180]);    IM(y[ 75]) = IM(x[180]);
+    RE(y[ 76]) = RE(x[196]);    IM(y[ 76]) = IM(x[196]);
+    RE(y[ 77]) = RE(x[212]);    IM(y[ 77]) = IM(x[212]);
+    RE(y[ 78]) = RE(x[228]);    IM(y[ 78]) = IM(x[228]);
+    RE(y[ 79]) = RE(x[244]);    IM(y[ 79]) = IM(x[244]);
+    RE(y[ 80]) = RE(x[  5]);    IM(y[ 80]) = IM(x[  5]);
+    RE(y[ 81]) = RE(x[ 21]);    IM(y[ 81]) = IM(x[ 21]);
+    RE(y[ 82]) = RE(x[ 37]);    IM(y[ 82]) = IM(x[ 37]);
+    RE(y[ 83]) = RE(x[ 53]);    IM(y[ 83]) = IM(x[ 53]);
+    RE(y[ 84]) = RE(x[ 69]);    IM(y[ 84]) = IM(x[ 69]);
+    RE(y[ 85]) = RE(x[ 85]);    IM(y[ 85]) = IM(x[ 85]);
+    RE(y[ 86]) = RE(x[101]);    IM(y[ 86]) = IM(x[101]);
+    RE(y[ 87]) = RE(x[117]);    IM(y[ 87]) = IM(x[117]);
+    RE(y[ 88]) = RE(x[133]);    IM(y[ 88]) = IM(x[133]);
+    RE(y[ 89]) = RE(x[149]);    IM(y[ 89]) = IM(x[149]);
+    RE(y[ 90]) = RE(x[165]);    IM(y[ 90]) = IM(x[165]);
+    RE(y[ 91]) = RE(x[181]);    IM(y[ 91]) = IM(x[181]);
+    RE(y[ 92]) = RE(x[197]);    IM(y[ 92]) = IM(x[197]);
+    RE(y[ 93]) = RE(x[213]);    IM(y[ 93]) = IM(x[213]);
+    RE(y[ 94]) = RE(x[229]);    IM(y[ 94]) = IM(x[229]);
+    RE(y[ 95]) = RE(x[245]);    IM(y[ 95]) = IM(x[245]);
+    RE(y[ 96]) = RE(x[  6]);    IM(y[ 96]) = IM(x[  6]);
+    RE(y[ 97]) = RE(x[ 22]);    IM(y[ 97]) = IM(x[ 22]);
+    RE(y[ 98]) = RE(x[ 38]);    IM(y[ 98]) = IM(x[ 38]);
+    RE(y[ 99]) = RE(x[ 54]);    IM(y[ 99]) = IM(x[ 54]);
+    RE(y[100]) = RE(x[ 70]);    IM(y[100]) = IM(x[ 70]);
+    RE(y[101]) = RE(x[ 86]);    IM(y[101]) = IM(x[ 86]);
+    RE(y[102]) = RE(x[102]);    IM(y[102]) = IM(x[102]);
+    RE(y[103]) = RE(x[118]);    IM(y[103]) = IM(x[118]);
+    RE(y[104]) = RE(x[134]);    IM(y[104]) = IM(x[134]);
+    RE(y[105]) = RE(x[150]);    IM(y[105]) = IM(x[150]);
+    RE(y[106]) = RE(x[166]);    IM(y[106]) = IM(x[166]);
+    RE(y[107]) = RE(x[182]);    IM(y[107]) = IM(x[182]);
+    RE(y[108]) = RE(x[198]);    IM(y[108]) = IM(x[198]);
+    RE(y[109]) = RE(x[214]);    IM(y[109]) = IM(x[214]);
+    RE(y[110]) = RE(x[230]);    IM(y[110]) = IM(x[230]);
+    RE(y[111]) = RE(x[246]);    IM(y[111]) = IM(x[246]);
+    RE(y[112]) = RE(x[  7]);    IM(y[112]) = IM(x[  7]);
+    RE(y[113]) = RE(x[ 23]);    IM(y[113]) = IM(x[ 23]);
+    RE(y[114]) = RE(x[ 39]);    IM(y[114]) = IM(x[ 39]);
+    RE(y[115]) = RE(x[ 55]);    IM(y[115]) = IM(x[ 55]);
+    RE(y[116]) = RE(x[ 71]);    IM(y[116]) = IM(x[ 71]);
+    RE(y[117]) = RE(x[ 87]);    IM(y[117]) = IM(x[ 87]);
+    RE(y[118]) = RE(x[103]);    IM(y[118]) = IM(x[103]);
+    RE(y[119]) = RE(x[119]);    IM(y[119]) = IM(x[119]);
+    RE(y[120]) = RE(x[135]);    IM(y[120]) = IM(x[135]);
+    RE(y[121]) = RE(x[151]);    IM(y[121]) = IM(x[151]);
+    RE(y[122]) = RE(x[167]);    IM(y[122]) = IM(x[167]);
+    RE(y[123]) = RE(x[183]);    IM(y[123]) = IM(x[183]);
+    RE(y[124]) = RE(x[199]);    IM(y[124]) = IM(x[199]);
+    RE(y[125]) = RE(x[215]);    IM(y[125]) = IM(x[215]);
+    RE(y[126]) = RE(x[231]);    IM(y[126]) = IM(x[231]);
+    RE(y[127]) = RE(x[247]);    IM(y[127]) = IM(x[247]);
+    RE(y[128]) = RE(x[  8]);    IM(y[128]) = IM(x[  8]);
+    RE(y[129]) = RE(x[ 24]);    IM(y[129]) = IM(x[ 24]);
+    RE(y[130]) = RE(x[ 40]);    IM(y[130]) = IM(x[ 40]);
+    RE(y[131]) = RE(x[ 56]);    IM(y[131]) = IM(x[ 56]);
+    RE(y[132]) = RE(x[ 72]);    IM(y[132]) = IM(x[ 72]);
+    RE(y[133]) = RE(x[ 88]);    IM(y[133]) = IM(x[ 88]);
+    RE(y[134]) = RE(x[104]);    IM(y[134]) = IM(x[104]);
+    RE(y[135]) = RE(x[120]);    IM(y[135]) = IM(x[120]);
+    RE(y[136]) = RE(x[136]);    IM(y[136]) = IM(x[136]);
+    RE(y[137]) = RE(x[152]);    IM(y[137]) = IM(x[152]);
+    RE(y[138]) = RE(x[168]);    IM(y[138]) = IM(x[168]);
+    RE(y[139]) = RE(x[184]);    IM(y[139]) = IM(x[184]);
+    RE(y[140]) = RE(x[200]);    IM(y[140]) = IM(x[200]);
+    RE(y[141]) = RE(x[216]);    IM(y[141]) = IM(x[216]);
+    RE(y[142]) = RE(x[232]);    IM(y[142]) = IM(x[232]);
+    RE(y[143]) = RE(x[248]);    IM(y[143]) = IM(x[248]);
+    RE(y[144]) = RE(x[  9]);    IM(y[144]) = IM(x[  9]);
+    RE(y[145]) = RE(x[ 25]);    IM(y[145]) = IM(x[ 25]);
+    RE(y[146]) = RE(x[ 41]);    IM(y[146]) = IM(x[ 41]);
+    RE(y[147]) = RE(x[ 57]);    IM(y[147]) = IM(x[ 57]);
+    RE(y[148]) = RE(x[ 73]);    IM(y[148]) = IM(x[ 73]);
+    RE(y[149]) = RE(x[ 89]);    IM(y[149]) = IM(x[ 89]);
+    RE(y[150]) = RE(x[105]);    IM(y[150]) = IM(x[105]);
+    RE(y[151]) = RE(x[121]);    IM(y[151]) = IM(x[121]);
+    RE(y[152]) = RE(x[137]);    IM(y[152]) = IM(x[137]);
+    RE(y[153]) = RE(x[153]);    IM(y[153]) = IM(x[153]);
+    RE(y[154]) = RE(x[169]);    IM(y[154]) = IM(x[169]);
+    RE(y[155]) = RE(x[185]);    IM(y[155]) = IM(x[185]);
+    RE(y[156]) = RE(x[201]);    IM(y[156]) = IM(x[201]);
+    RE(y[157]) = RE(x[217]);    IM(y[157]) = IM(x[217]);
+    RE(y[158]) = RE(x[233]);    IM(y[158]) = IM(x[233]);
+    RE(y[159]) = RE(x[249]);    IM(y[159]) = IM(x[249]);
+    RE(y[160]) = RE(x[ 10]);    IM(y[160]) = IM(x[ 10]);
+    RE(y[161]) = RE(x[ 26]);    IM(y[161]) = IM(x[ 26]);
+    RE(y[162]) = RE(x[ 42]);    IM(y[162]) = IM(x[ 42]);
+    RE(y[163]) = RE(x[ 58]);    IM(y[163]) = IM(x[ 58]);
+    RE(y[164]) = RE(x[ 74]);    IM(y[164]) = IM(x[ 74]);
+    RE(y[165]) = RE(x[ 90]);    IM(y[165]) = IM(x[ 90]);
+    RE(y[166]) = RE(x[106]);    IM(y[166]) = IM(x[106]);
+    RE(y[167]) = RE(x[122]);    IM(y[167]) = IM(x[122]);
+    RE(y[168]) = RE(x[138]);    IM(y[168]) = IM(x[138]);
+    RE(y[169]) = RE(x[154]);    IM(y[169]) = IM(x[154]);
+    RE(y[170]) = RE(x[170]);    IM(y[170]) = IM(x[170]);
+    RE(y[171]) = RE(x[186]);    IM(y[171]) = IM(x[186]);
+    RE(y[172]) = RE(x[202]);    IM(y[172]) = IM(x[202]);
+    RE(y[173]) = RE(x[218]);    IM(y[173]) = IM(x[218]);
+    RE(y[174]) = RE(x[234]);    IM(y[174]) = IM(x[234]);
+    RE(y[175]) = RE(x[250]);    IM(y[175]) = IM(x[250]);
+    RE(y[176]) = RE(x[ 11]);    IM(y[176]) = IM(x[ 11]);
+    RE(y[177]) = RE(x[ 27]);    IM(y[177]) = IM(x[ 27]);
+    RE(y[178]) = RE(x[ 43]);    IM(y[178]) = IM(x[ 43]);
+    RE(y[179]) = RE(x[ 59]);    IM(y[179]) = IM(x[ 59]);
+    RE(y[180]) = RE(x[ 75]);    IM(y[180]) = IM(x[ 75]);
+    RE(y[181]) = RE(x[ 91]);    IM(y[181]) = IM(x[ 91]);
+    RE(y[182]) = RE(x[107]);    IM(y[182]) = IM(x[107]);
+    RE(y[183]) = RE(x[123]);    IM(y[183]) = IM(x[123]);
+    RE(y[184]) = RE(x[139]);    IM(y[184]) = IM(x[139]);
+    RE(y[185]) = RE(x[155]);    IM(y[185]) = IM(x[155]);
+    RE(y[186]) = RE(x[171]);    IM(y[186]) = IM(x[171]);
+    RE(y[187]) = RE(x[187]);    IM(y[187]) = IM(x[187]);
+    RE(y[188]) = RE(x[203]);    IM(y[188]) = IM(x[203]);
+    RE(y[189]) = RE(x[219]);    IM(y[189]) = IM(x[219]);
+    RE(y[190]) = RE(x[235]);    IM(y[190]) = IM(x[235]);
+    RE(y[191]) = RE(x[251]);    IM(y[191]) = IM(x[251]);
+    RE(y[192]) = RE(x[ 12]);    IM(y[192]) = IM(x[ 12]);
+    RE(y[193]) = RE(x[ 28]);    IM(y[193]) = IM(x[ 28]);
+    RE(y[194]) = RE(x[ 44]);    IM(y[194]) = IM(x[ 44]);
+    RE(y[195]) = RE(x[ 60]);    IM(y[195]) = IM(x[ 60]);
+    RE(y[196]) = RE(x[ 76]);    IM(y[196]) = IM(x[ 76]);
+    RE(y[197]) = RE(x[ 92]);    IM(y[197]) = IM(x[ 92]);
+    RE(y[198]) = RE(x[108]);    IM(y[198]) = IM(x[108]);
+    RE(y[199]) = RE(x[124]);    IM(y[199]) = IM(x[124]);
+    RE(y[200]) = RE(x[140]);    IM(y[200]) = IM(x[140]);
+    RE(y[201]) = RE(x[156]);    IM(y[201]) = IM(x[156]);
+    RE(y[202]) = RE(x[172]);    IM(y[202]) = IM(x[172]);
+    RE(y[203]) = RE(x[188]);    IM(y[203]) = IM(x[188]);
+    RE(y[204]) = RE(x[204]);    IM(y[204]) = IM(x[204]);
+    RE(y[205]) = RE(x[220]);    IM(y[205]) = IM(x[220]);
+    RE(y[206]) = RE(x[236]);    IM(y[206]) = IM(x[236]);
+    RE(y[207]) = RE(x[252]);    IM(y[207]) = IM(x[252]);
+    RE(y[208]) = RE(x[ 13]);    IM(y[208]) = IM(x[ 13]);
+    RE(y[209]) = RE(x[ 29]);    IM(y[209]) = IM(x[ 29]);
+    RE(y[210]) = RE(x[ 45]);    IM(y[210]) = IM(x[ 45]);
+    RE(y[211]) = RE(x[ 61]);    IM(y[211]) = IM(x[ 61]);
+    RE(y[212]) = RE(x[ 77]);    IM(y[212]) = IM(x[ 77]);
+    RE(y[213]) = RE(x[ 93]);    IM(y[213]) = IM(x[ 93]);
+    RE(y[214]) = RE(x[109]);    IM(y[214]) = IM(x[109]);
+    RE(y[215]) = RE(x[125]);    IM(y[215]) = IM(x[125]);
+    RE(y[216]) = RE(x[141]);    IM(y[216]) = IM(x[141]);
+    RE(y[217]) = RE(x[157]);    IM(y[217]) = IM(x[157]);
+    RE(y[218]) = RE(x[173]);    IM(y[218]) = IM(x[173]);
+    RE(y[219]) = RE(x[189]);    IM(y[219]) = IM(x[189]);
+    RE(y[220]) = RE(x[205]);    IM(y[220]) = IM(x[205]);
+    RE(y[221]) = RE(x[221]);    IM(y[221]) = IM(x[221]);
+    RE(y[222]) = RE(x[237]);    IM(y[222]) = IM(x[237]);
+    RE(y[223]) = RE(x[253]);    IM(y[223]) = IM(x[253]);
+    RE(y[224]) = RE(x[ 14]);    IM(y[224]) = IM(x[ 14]);
+    RE(y[225]) = RE(x[ 30]);    IM(y[225]) = IM(x[ 30]);
+    RE(y[226]) = RE(x[ 46]);    IM(y[226]) = IM(x[ 46]);
+    RE(y[227]) = RE(x[ 62]);    IM(y[227]) = IM(x[ 62]);
+    RE(y[228]) = RE(x[ 78]);    IM(y[228]) = IM(x[ 78]);
+    RE(y[229]) = RE(x[ 94]);    IM(y[229]) = IM(x[ 94]);
+    RE(y[230]) = RE(x[110]);    IM(y[230]) = IM(x[110]);
+    RE(y[231]) = RE(x[126]);    IM(y[231]) = IM(x[126]);
+    RE(y[232]) = RE(x[142]);    IM(y[232]) = IM(x[142]);
+    RE(y[233]) = RE(x[158]);    IM(y[233]) = IM(x[158]);
+    RE(y[234]) = RE(x[174]);    IM(y[234]) = IM(x[174]);
+    RE(y[235]) = RE(x[190]);    IM(y[235]) = IM(x[190]);
+    RE(y[236]) = RE(x[206]);    IM(y[236]) = IM(x[206]);
+    RE(y[237]) = RE(x[222]);    IM(y[237]) = IM(x[222]);
+    RE(y[238]) = RE(x[238]);    IM(y[238]) = IM(x[238]);
+    RE(y[239]) = RE(x[254]);    IM(y[239]) = IM(x[254]);
+    RE(y[240]) = RE(x[ 15]);    IM(y[240]) = IM(x[ 15]);
+    RE(y[241]) = RE(x[ 31]);    IM(y[241]) = IM(x[ 31]);
+    RE(y[242]) = RE(x[ 47]);    IM(y[242]) = IM(x[ 47]);
+    RE(y[243]) = RE(x[ 63]);    IM(y[243]) = IM(x[ 63]);
+    RE(y[244]) = RE(x[ 79]);    IM(y[244]) = IM(x[ 79]);
+    RE(y[245]) = RE(x[ 95]);    IM(y[245]) = IM(x[ 95]);
+    RE(y[246]) = RE(x[111]);    IM(y[246]) = IM(x[111]);
+    RE(y[247]) = RE(x[127]);    IM(y[247]) = IM(x[127]);
+    RE(y[248]) = RE(x[143]);    IM(y[248]) = IM(x[143]);
+    RE(y[249]) = RE(x[159]);    IM(y[249]) = IM(x[159]);
+    RE(y[250]) = RE(x[175]);    IM(y[250]) = IM(x[175]);
+    RE(y[251]) = RE(x[191]);    IM(y[251]) = IM(x[191]);
+    RE(y[252]) = RE(x[207]);    IM(y[252]) = IM(x[207]);
+    RE(y[253]) = RE(x[223]);    IM(y[253]) = IM(x[223]);
+    RE(y[254]) = RE(x[239]);    IM(y[254]) = IM(x[239]);
+    RE(y[255]) = RE(x[255]);    IM(y[255]) = IM(x[255]);  
+}
+
+
diff --git a/dspl/src/filter_ap.c b/dspl/src/filter_ap.c
index d7a2a06..ba6ed1a 100644
--- a/dspl/src/filter_ap.c
+++ b/dspl/src/filter_ap.c
@@ -96,7 +96,6 @@ In addition, GNUPLOT will build the following graphs from data stored in files:
 
 \author Sergey Bakhurin www.dsplib.org
 ***************************************************************************** */
-
 #endif
 #ifdef DOXYGEN_RUSSIAN
 /*! ****************************************************************************
@@ -211,11 +210,6 @@ exit_label:
 
 
 
-
-
-
-
-
 #ifdef DOXYGEN_ENGLISH
 /*! ****************************************************************************
 \ingroup IIR_FILTER_DESIGN_GROUP
diff --git a/include/dspl.h b/include/dspl.h
index 298262b..045dc5f 100644
--- a/include/dspl.h
+++ b/include/dspl.h
@@ -118,7 +118,10 @@ Memory must be allocated by \ref fft_create function. \n\n
 Pointer to the vector of intermediate results. \n
 The size of the vector is `[n x 1]`. \n
 The memory must be allocated with the \ref fft_create function. \n\n
-The structure is populated with the \ref fft_create function once
+
+
+
+The structure is calculated with the \ref fft_create function once
 before using the FFT algorithm. \n
 A pointer to an object of this structure may be
 reused when calling FFT functions. \n
@@ -186,6 +189,32 @@ then the structure arrays will be automatically recreated for the length `n`.
 Указатель на вектор промежуточных вычислений алгоритма БПФ. \n
 Размер вектора `[n x 1]`. \n
 Память должна быть выделена функцией \ref fft_create. \n \n
+
+\param w32
+Статический вектор поворотных коэффициентов 32-точечного БПФ. \n \n
+
+\param w64
+Статический вектор поворотных коэффициентов 64-точечного БПФ. \n \n
+
+\param w128
+Статический вектор поворотных коэффициентов 128-точечного БПФ. \n \n
+
+\param w256
+Статический вектор поворотных коэффициентов 256-точечного БПФ. \n \n
+
+\param w512
+Статический вектор поворотных коэффициентов 512-точечного БПФ. \n \n
+
+\param w1024
+Статический вектор поворотных коэффициентов 1024-точечного БПФ. \n \n
+
+\param w2048
+Статический вектор поворотных коэффициентов 2048-точечного БПФ. \n \n
+
+\param w4096
+Статический вектор поворотных коэффициентов 4096-точечного БПФ. \n \n
+
+
 Структура заполняется функцией \ref fft_create один раз
 до использования алгоритма БПФ.  \n
 Указатель на объект данной структуры может быть
@@ -225,10 +254,19 @@ www.dsplib.org
 #endif
 typedef struct
 {
-   complex_t*  w;
-   complex_t*  t0;
-   complex_t*  t1;
-   int         n;
+    complex_t*  w;
+    complex_t*  t0;
+    complex_t*  t1;
+   
+    complex_t    w32[ 32];
+    complex_t    w64[ 64];
+    complex_t   w128[128];
+    complex_t   w256[256];
+    complex_t   w512[512];
+    complex_t*  w1024;
+    complex_t*  w2048;
+    complex_t*  w4096;
+    int         n;
 } fft_t;
 
 
diff --git a/performance/bin/octave/fft_cmplx_performance.m b/performance/bin/octave/fft_cmplx_performance.m
index 9bab254..acaf5b9 100644
--- a/performance/bin/octave/fft_cmplx_performance.m
+++ b/performance/bin/octave/fft_cmplx_performance.m
@@ -23,49 +23,96 @@ for j = 1:21
   s = s / 2;
 endfor
 
+dspl_size = [2      
+             4      
+             8      
+             16     
+             32     
+             64     
+             128    
+             256    
+             512    
+             1024   
+             2048   
+             4096   
+             8192   
+             16384  
+             32768  
+             65536  
+             131072 
+             262144 
+             524288 
+             1048576];
+             
 
-dspl = [1204.630392
-1283.970612
-1586.347958
-1707.107097
-1866.109831
-1837.307509
-2366.785829
-2302.925874
-2388.456514
-2113.451546
-3090.904615
-2979.596190
-2685.155556
-2053.760000
-3723.946667
-3195.618462
-2328.221538
-1786.533333
-7288.960000
-4646.700000
-2633.120000];
+dspl_mflops = [597.7 
+               1946.0
+               4455.5
+               5446.3
+               4490.7
+               4288.5
+               3524.1
+               5286.9
+               3995.3
+               3657.6
+               2953.2
+               2078.0
+               2565.7
+               2615.5
+               2361.8
+               2376.4
+               2169.8
+               2285.5
+               2172.4
+               1896.4];
 
-python = [2390.741
-2597.527
-2841.191
-3066.652
-3092.187
-3444.710
-3633.320
-4333.845
-5316.897
-5201.486
-4608.231
-4481.357
-3876.925
-2961.753
-2435.427
-1344.871
- 606.953
- 298.559
- 120.772
-  50.369
-  17.033];
+python_size = [4194304
+               2097152
+               1048576
+                524288
+                262144
+                131072
+                 65536
+                 32768
+                 16384
+                  8192
+                  4096
+                  2048
+                  1024
+                   512
+                   256
+                   128
+                    64
+                    32
+                    16
+                     8
+                     4
+                     2];
+                     
+                     
+                     
+                     
+python_mflops = [2119.626
+                 2147.070
+                 2362.656
+                 2351.777
+                 2408.621
+                 2678.743
+                 3194.574
+                 3978.322
+                 5220.731
+                 4671.613
+                 4240.982
+                 3585.080
+                 3876.999
+                 2556.301
+                 2333.780
+                 1301.660
+                  606.947
+                  294.469
+                  127.103
+                   37.574
+                   15.669
+                    4.110];                     
    
-   plot(log2(size), mflops,log2(size), dspl, log2(size), python)
\ No newline at end of file
+   plot(log2(size), mflops,log2(dspl_size), dspl_mflops, log2(python_size), python_mflops)
\ No newline at end of file
diff --git a/performance/src/fft_cmplx_performance.c b/performance/src/fft_cmplx_performance.c
index 71a6a00..f00aff0 100644
--- a/performance/src/fft_cmplx_performance.c
+++ b/performance/src/fft_cmplx_performance.c
@@ -5,7 +5,7 @@
 #include "dspl.h"
 
 #define NMAX 4194304
-#define L 18
+#define L 20
 #define SIZE_FACTOR 2.3 
 
 
@@ -87,10 +87,10 @@ int main(int argc, char* argv[])
     hdspl = dspl_load();    /* Load DSPL function */
     
     int len_r2[L] = {2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096,
-                    8192, 16384, 32768, 65536, 131072, 262144};
+                    8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576};
                     
-    int len_nr[L] = {6, 9, 12, 15, 18, 24, 36, 80, 108, 210, 504, 1000,
-                    1960, 4725, 10368, 27000, 75600, 165375};
+    int len_nr[L] = {6, 9, 12, 15, 18, 24, 36, 80, 100, 108, 210, 504, 1000,
+                    1960, 4725, 8000, 10368, 27000, 75600, 165375};
 
     int err;
     double   mflops[L] = {0};
diff --git a/verification/bin/octave/fft_radix2_verification.m b/verification/bin/octave/fft_radix2_verification.m
new file mode 100644
index 0000000..95b0e4d
--- /dev/null
+++ b/verification/bin/octave/fft_radix2_verification.m
@@ -0,0 +1,44 @@
+clear all; close all; clc;
+addpath('octave');
+
+
+fn_in = {'dat/x_fft_4.dat';
+         'dat/x_fft_8.dat';
+         'dat/x_fft_16.dat';
+         'dat/x_fft_32.dat';
+         'dat/x_fft_64.dat';
+         'dat/x_fft_128.dat';
+         'dat/x_fft_256.dat';
+         'dat/x_fft_512.dat';
+         'dat/x_fft_1024.dat';
+         'dat/x_fft_2048.dat';
+         'dat/x_fft_4096.dat';
+         'dat/x_fft_8192.dat';
+         'dat/x_fft_16384.dat';
+         'dat/x_fft_32768.dat';
+         'dat/x_fft_65536.dat'};
+
+fn_out = {'dat/y_fft_4.dat';
+          'dat/y_fft_8.dat';
+          'dat/y_fft_16.dat';
+          'dat/y_fft_32.dat';
+          'dat/y_fft_64.dat';
+          'dat/y_fft_128.dat';
+          'dat/y_fft_256.dat';
+          'dat/y_fft_512.dat';
+          'dat/y_fft_1024.dat';
+          'dat/y_fft_2048.dat';
+          'dat/y_fft_4096.dat';
+          'dat/y_fft_8192.dat';
+          'dat/y_fft_16384.dat';
+          'dat/y_fft_32768.dat';
+          'dat/y_fft_65536.dat'};
+
+
+for i = 1:length(fn_in)
+    x = readbin(fn_in{i});
+    y = fft(x);
+    writebin(y, 1, fn_out{i});
+end
+
+
diff --git a/verification/src/fft_radix2_verification.c b/verification/src/fft_radix2_verification.c
new file mode 100644
index 0000000..8faa8ea
--- /dev/null
+++ b/verification/src/fft_radix2_verification.c
@@ -0,0 +1,156 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include "dspl.h"
+
+#define FFT_SIZE   65536
+
+
+int main(int argc, char* argv[])
+{
+    void* hdspl;            /* DSPL handle        */
+    fft_t pfft = {0};
+
+    int verr, nx, type;
+    double derr;
+    complex_t *yout = NULL;
+    complex_t *xc   = NULL;
+
+    hdspl = dspl_load();    /* Load DSPL function */
+    
+
+    verif_data_gen(4,     DAT_COMPLEX, "dat/x_fft_4.dat");
+    verif_data_gen(8,     DAT_COMPLEX, "dat/x_fft_8.dat");
+    verif_data_gen(16,    DAT_COMPLEX, "dat/x_fft_16.dat");
+    verif_data_gen(32,    DAT_COMPLEX, "dat/x_fft_32.dat");
+    verif_data_gen(64,    DAT_COMPLEX, "dat/x_fft_64.dat");
+    verif_data_gen(128,   DAT_COMPLEX, "dat/x_fft_128.dat");
+    verif_data_gen(256,   DAT_COMPLEX, "dat/x_fft_256.dat");
+    verif_data_gen(512,   DAT_COMPLEX, "dat/x_fft_512.dat");
+    verif_data_gen(1024,  DAT_COMPLEX, "dat/x_fft_1024.dat");
+    verif_data_gen(2048,  DAT_COMPLEX, "dat/x_fft_2048.dat");
+    verif_data_gen(4096,  DAT_COMPLEX, "dat/x_fft_4096.dat");
+    verif_data_gen(8192,  DAT_COMPLEX, "dat/x_fft_8192.dat");
+    verif_data_gen(16384, DAT_COMPLEX, "dat/x_fft_16384.dat");
+    verif_data_gen(32768, DAT_COMPLEX, "dat/x_fft_32768.dat");
+    verif_data_gen(65536, DAT_COMPLEX, "dat/x_fft_65536.dat");
+    
+    yout = (complex_t*)malloc(FFT_SIZE * sizeof(complex_t  ));
+
+    system("octave octave/fft_radix2_verification.m");
+    
+
+    /*------------------------------------------------------------------------*/
+    readbin("dat/x_fft_4.dat", (void**)(&xc), &nx, &type);
+    fft_cmplx(xc, 4, &pfft, yout);
+    verif_str_cmplx(yout, 4, "fft 4    for complex dat", 
+                             "dat/y_fft_4.dat", 
+                             "verification.log");
+    /*------------------------------------------------------------------------*/
+    readbin("dat/x_fft_8.dat", (void**)(&xc), &nx, &type);
+    fft_cmplx(xc, 8, &pfft, yout);
+    verif_str_cmplx(yout, 8, "fft 8    for complex dat", 
+                             "dat/y_fft_8.dat", 
+                             "verification.log");
+    /*------------------------------------------------------------------------*/
+    readbin("dat/x_fft_16.dat", (void**)(&xc), &nx, &type);
+    fft_cmplx(xc, 16, &pfft, yout);
+    verif_str_cmplx(yout, 16, "fft 16   for complex dat", 
+                             "dat/y_fft_16.dat", 
+                             "verification.log");
+                             
+    /*------------------------------------------------------------------------*/
+    readbin("dat/x_fft_32.dat", (void**)(&xc), &nx, &type);
+    fft_cmplx(xc, 32, &pfft, yout);
+    verif_str_cmplx(yout, 32, "fft 32   for complex dat", 
+                             "dat/y_fft_32.dat", 
+                             "verification.log");
+
+    /*------------------------------------------------------------------------*/
+    readbin("dat/x_fft_64.dat", (void**)(&xc), &nx, &type);
+    fft_cmplx(xc, 64, &pfft, yout);
+    verif_str_cmplx(yout, 64, "fft 64   for complex dat", 
+                             "dat/y_fft_64.dat", 
+                             "verification.log");
+    /*------------------------------------------------------------------------*/
+    readbin("dat/x_fft_128.dat", (void**)(&xc), &nx, &type);
+    fft_cmplx(xc, 128, &pfft, yout);
+    verif_str_cmplx(yout, 128, "fft 128  for complex dat", 
+                             "dat/y_fft_128.dat", 
+                             "verification.log");
+    /*------------------------------------------------------------------------*/
+    readbin("dat/x_fft_256.dat", (void**)(&xc), &nx, &type);
+    fft_cmplx(xc, 256, &pfft, yout);
+    verif_str_cmplx(yout, 256, "fft 256   for complex dat", 
+                             "dat/y_fft_256.dat", 
+                             "verification.log");
+                             
+    /*------------------------------------------------------------------------*/
+    readbin("dat/x_fft_512.dat", (void**)(&xc), &nx, &type);
+    fft_cmplx(xc, 512, &pfft, yout);
+    verif_str_cmplx(yout, 512, "fft 512   for complex dat", 
+                             "dat/y_fft_512.dat", 
+                             "verification.log");
+
+    /*------------------------------------------------------------------------*/
+    readbin("dat/x_fft_1024.dat", (void**)(&xc), &nx, &type);
+    fft_cmplx(xc, 1024, &pfft, yout);
+    verif_str_cmplx(yout, 1024, "fft 1024  for complex dat", 
+                             "dat/y_fft_1024.dat", 
+                             "verification.log");
+                             
+    /*------------------------------------------------------------------------*/
+    readbin("dat/x_fft_2048.dat", (void**)(&xc), &nx, &type);
+    fft_cmplx(xc, 2048, &pfft, yout);
+    verif_str_cmplx(yout, 2048, "fft 2048  for complex dat", 
+                             "dat/y_fft_2048.dat", 
+                             "verification.log");
+                             
+    /*------------------------------------------------------------------------*/
+    readbin("dat/x_fft_4096.dat", (void**)(&xc), &nx, &type);
+    fft_cmplx(xc, 4096, &pfft, yout);
+    verif_str_cmplx(yout, 4096, "fft 4096  for complex dat", 
+                             "dat/y_fft_4096.dat", 
+                             "verification.log");
+  
+    /*------------------------------------------------------------------------*/
+    readbin("dat/x_fft_8192.dat", (void**)(&xc), &nx, &type);
+    fft_cmplx(xc, 8192, &pfft, yout);
+    verif_str_cmplx(yout, 8192, "fft 8192  for complex dat", 
+                             "dat/y_fft_8192.dat", 
+                             "verification.log");
+
+    /*------------------------------------------------------------------------*/
+    readbin("dat/x_fft_16384.dat", (void**)(&xc), &nx, &type);
+    fft_cmplx(xc, 16384, &pfft, yout);
+    verif_str_cmplx(yout, 16384, "fft 16384  for complex dat", 
+                             "dat/y_fft_16384.dat", 
+                             "verification.log");
+
+    /*------------------------------------------------------------------------*/
+    readbin("dat/x_fft_32768.dat", (void**)(&xc), &nx, &type);
+    fft_cmplx(xc, 32768, &pfft, yout);
+    verif_str_cmplx(yout, 32768, "fft 32768  for complex dat", 
+                             "dat/y_fft_32768.dat", 
+                             "verification.log");
+                             
+    /*------------------------------------------------------------------------*/
+    readbin("dat/x_fft_65536.dat", (void**)(&xc), &nx, &type);
+    fft_cmplx(xc, 65536, &pfft, yout);
+    verif_str_cmplx(yout, 65536, "fft 65536  for complex dat", 
+                             "dat/y_fft_65536.dat", 
+                             "verification.log");
+
+
+    /* free dspl handle */
+    dspl_free(hdspl);
+    
+    if(yout)
+        free(yout);
+    if(xc)
+        free(xc);
+
+    return 0;
+}
+