Implementation notes: amd64, speed2supercop, crypto_core/multsntrup653

Computer: speed2supercop
Microarchitecture: amd64; Haswell+AES (306c3)
Architecture: amd64
CPU ID: GenuineIntel-000306c3-1fc9cbf5
SUPERCOP version: 20240625
Operation: crypto_core
Primitive: multsntrup653
TimeObject sizeTest sizeImplementationCompilerBenchmark dateSUPERCOP version
1626820788 0 035774 792 776avxclang_-march=native_-O2_-fwrapv_-Qunused-arguments_-fPIC_-fPIE_-gdwarf-4_-Wall2024070120240625
1630420572 0 035558 792 776avx800clang_-march=native_-O2_-fwrapv_-Qunused-arguments_-fPIC_-fPIE_-gdwarf-4_-Wall2024070120240625
1650819788 0 034870 792 776avx800clang_-march=native_-O3_-fwrapv_-Qunused-arguments_-fPIC_-fPIE_-gdwarf-4_-Wall2024070120240625
1654420020 0 035102 792 776avxclang_-march=native_-O3_-fwrapv_-Qunused-arguments_-fPIC_-fPIE_-gdwarf-4_-Wall2024070120240625
1718018915 0 032373 752 832avx800gcc_-march=native_-mtune=native_-O3_-fwrapv_-fPIC_-fPIE_-gdwarf-4_-Wall2024070120240625
1733617937 0 032894 792 776round2clang_-march=native_-O2_-fwrapv_-Qunused-arguments_-fPIC_-fPIE_-gdwarf-4_-Wall2024070120240625
1752817116 0 032166 792 776round2clang_-march=native_-O3_-fwrapv_-Qunused-arguments_-fPIC_-fPIE_-gdwarf-4_-Wall2024070120240625
1770419847 0 030942 792 760avxclang_-march=native_-O_-fwrapv_-Qunused-arguments_-fPIC_-fPIE_-gdwarf-4_-Wall2024070120240625
1771219459 0 032917 752 832avxgcc_-march=native_-mtune=native_-O3_-fwrapv_-fPIC_-fPIE_-gdwarf-4_-Wall2024070120240625
1794019927 0 031022 792 760avx800clang_-march=native_-O_-fwrapv_-Qunused-arguments_-fPIC_-fPIE_-gdwarf-4_-Wall2024070120240625
1842817744 0 029639 784 856avxclang_-march=native_-Os_-fwrapv_-Qunused-arguments_-fPIC_-fPIE_-gdwarf-4_-Wall2024070120240625
1850017664 0 029551 784 856avx800clang_-march=native_-Os_-fwrapv_-Qunused-arguments_-fPIC_-fPIE_-gdwarf-4_-Wall2024070120240625
1870816469 0 029933 752 832round2gcc_-march=native_-mtune=native_-O3_-fwrapv_-fPIC_-fPIE_-gdwarf-4_-Wall2024070120240625
1878018038 0 029493 752 832avx800gcc_-march=native_-mtune=native_-O2_-fwrapv_-fPIC_-fPIE_-gdwarf-4_-Wall2024070120240625
1897216335 0 027398 792 760round2clang_-march=native_-O_-fwrapv_-Qunused-arguments_-fPIC_-fPIE_-gdwarf-4_-Wall2024070120240625
1898818106 0 029324 744 832avx800gcc_-march=native_-mtune=native_-O_-fwrapv_-fPIC_-fPIE_-gdwarf-4_-Wall2024070120240625
1900817707 0 027972 736 800avx800gcc_-march=native_-mtune=native_-Os_-fwrapv_-fPIC_-fPIE_-gdwarf-4_-Wall2024070120240625
1924418582 0 030037 752 832avxgcc_-march=native_-mtune=native_-O2_-fwrapv_-fPIC_-fPIE_-gdwarf-4_-Wall2024070120240625
1931218687 0 029900 744 832avxgcc_-march=native_-mtune=native_-O_-fwrapv_-fPIC_-fPIE_-gdwarf-4_-Wall2024070120240625
1952418275 0 028540 736 800avxgcc_-march=native_-mtune=native_-Os_-fwrapv_-fPIC_-fPIE_-gdwarf-4_-Wall2024070120240625
1960813838 0 025703 784 856round2clang_-march=native_-Os_-fwrapv_-Qunused-arguments_-fPIC_-fPIE_-gdwarf-4_-Wall2024070120240625
2019614758 0 025972 744 832round2gcc_-march=native_-mtune=native_-O_-fwrapv_-fPIC_-fPIE_-gdwarf-4_-Wall2024070120240625
2042814643 0 026101 752 832round2gcc_-march=native_-mtune=native_-O2_-fwrapv_-fPIC_-fPIE_-gdwarf-4_-Wall2024070120240625
2192813839 0 024108 736 800round2gcc_-march=native_-mtune=native_-Os_-fwrapv_-fPIC_-fPIE_-gdwarf-4_-Wall2024070120240625
1424085009 0 018437 752 832refgcc_-march=native_-mtune=native_-O3_-fwrapv_-fPIC_-fPIE_-gdwarf-4_-Wall2024070120240625
2112482635 0 017606 792 776refclang_-march=native_-O2_-fwrapv_-Qunused-arguments_-fPIC_-fPIE_-gdwarf-4_-Wall2024070120240625
2113803163 0 018310 792 776refclang_-march=native_-O3_-fwrapv_-Qunused-arguments_-fPIC_-fPIE_-gdwarf-4_-Wall2024070120240625
5588842074 0 015790 792 760refclang_-mcpu=native_-O3_-fwrapv_-Qunused-arguments_-fPIC_-fPIE_-gdwarf-4_-Wall2024070120240625
8127441266 0 012677 752 832refgcc_-march=native_-mtune=native_-O2_-fwrapv_-fPIC_-fPIE_-gdwarf-4_-Wall2024070120240625
817024650 0 011702 792 760refclang_-march=native_-O_-fwrapv_-Qunused-arguments_-fPIC_-fPIE_-gdwarf-4_-Wall2024070120240625
839644641 0 012575 784 856refclang_-march=native_-Os_-fwrapv_-Qunused-arguments_-fPIC_-fPIE_-gdwarf-4_-Wall2024070120240625
861804575 0 011740 744 832refgcc_-march=native_-mtune=native_-O_-fwrapv_-fPIC_-fPIE_-gdwarf-4_-Wall2024070120240625
942312463 0 010668 736 800refgcc_-march=native_-mtune=native_-Os_-fwrapv_-fPIC_-fPIE_-gdwarf-4_-Wall2024070120240625

Compiler output


mult768.c: mult768.c:265:7: error: always_inline function '_mm256_set1_epi16' requires target feature 'avx', but would be inlined into function 'crypto_core_multsntrup653_avx_constbranchindex' that is compiled without support for 'avx'
mult768.c:   x = const_x16(0);
mult768.c:       ^
mult768.c: mult768.c:10:19: note: expanded from macro 'const_x16'
mult768.c: #define const_x16 _mm256_set1_epi16
mult768.c:                   ^
mult768.c: mult768.c:265:7: error: AVX vector return of type '__m256i' (vector of 4 'long long' values) without 'avx' enabled changes the ABI
mult768.c: mult768.c:10:19: note: expanded from macro 'const_x16'
mult768.c: #define const_x16 _mm256_set1_epi16
mult768.c:                   ^
mult768.c: mult768.c:266:35: error: always_inline function '_mm256_storeu_si256' requires target feature 'avx', but would be inlined into function 'crypto_core_multsntrup653_avx_constbranchindex' that is compiled without support for 'avx'
mult768.c:   for (i = p&~15;i < 768;i += 16) store_x16(&f[i],x);
mult768.c:                                   ^
mult768.c: mult768.c:9:24: note: expanded from macro 'store_x16'
mult768.c: #define store_x16(p,v) _mm256_storeu_si256((int16x16 *) (p),(v))
mult768.c:                        ^
mult768.c: mult768.c:266:35: error: AVX vector argument of type '__m256i' (vector of 4 'long long' values) without 'avx' enabled changes the ABI
mult768.c: mult768.c:9:24: note: expanded from macro 'store_x16'
mult768.c: #define store_x16(p,v) _mm256_storeu_si256((int16x16 *) (p),(v))
mult768.c:                        ^
mult768.c: mult768.c:267:35: error: always_inline function '_mm256_storeu_si256' requires target feature 'avx', but would be inlined into function 'crypto_core_multsntrup653_avx_constbranchindex' that is compiled without support for 'avx'
mult768.c:   for (i = p&~15;i < 768;i += 16) store_x16(&g[i],x);
mult768.c:                                   ^
mult768.c: mult768.c:9:24: note: expanded from macro 'store_x16'
mult768.c: #define store_x16(p,v) _mm256_storeu_si256((int16x16 *) (p),(v))
mult768.c: ...

Number of similar (implementation,compiler) pairs: 1, namely:
ImplementationCompiler
avxclang -mcpu=native -O3 -fwrapv -Qunused-arguments -fPIC -fPIE -gdwarf-4 -Wall (Debian_Clang_16.0.6_(27+b1))

Compiler output


mult768.c: mult768.c:265:7: error: always_inline function '_mm256_set1_epi16' requires target feature 'avx', but would be inlined into function 'crypto_core_multsntrup653_avx800_constbranchindex' that is compiled without support for 'avx'
mult768.c:   x = const_x16(0);
mult768.c:       ^
mult768.c: mult768.c:10:19: note: expanded from macro 'const_x16'
mult768.c: #define const_x16 _mm256_set1_epi16
mult768.c:                   ^
mult768.c: mult768.c:265:7: error: AVX vector return of type '__m256i' (vector of 4 'long long' values) without 'avx' enabled changes the ABI
mult768.c: mult768.c:10:19: note: expanded from macro 'const_x16'
mult768.c: #define const_x16 _mm256_set1_epi16
mult768.c:                   ^
mult768.c: mult768.c:266:35: error: always_inline function '_mm256_storeu_si256' requires target feature 'avx', but would be inlined into function 'crypto_core_multsntrup653_avx800_constbranchindex' that is compiled without support for 'avx'
mult768.c:   for (i = p&~15;i < 768;i += 16) store_x16(&f[i],x);
mult768.c:                                   ^
mult768.c: mult768.c:9:24: note: expanded from macro 'store_x16'
mult768.c: #define store_x16(p,v) _mm256_storeu_si256((int16x16 *) (p),(v))
mult768.c:                        ^
mult768.c: mult768.c:266:35: error: AVX vector argument of type '__m256i' (vector of 4 'long long' values) without 'avx' enabled changes the ABI
mult768.c: mult768.c:9:24: note: expanded from macro 'store_x16'
mult768.c: #define store_x16(p,v) _mm256_storeu_si256((int16x16 *) (p),(v))
mult768.c:                        ^
mult768.c: mult768.c:267:35: error: always_inline function '_mm256_storeu_si256' requires target feature 'avx', but would be inlined into function 'crypto_core_multsntrup653_avx800_constbranchindex' that is compiled without support for 'avx'
mult768.c:   for (i = p&~15;i < 768;i += 16) store_x16(&g[i],x);
mult768.c:                                   ^
mult768.c: mult768.c:9:24: note: expanded from macro 'store_x16'
mult768.c: #define store_x16(p,v) _mm256_storeu_si256((int16x16 *) (p),(v))
mult768.c: ...

Number of similar (implementation,compiler) pairs: 1, namely:
ImplementationCompiler
avx800clang -mcpu=native -O3 -fwrapv -Qunused-arguments -fPIC -fPIE -gdwarf-4 -Wall (Debian_Clang_16.0.6_(27+b1))

Compiler output


ntt.c: ntt.c:562:35: warning: unused variable 'h0' [-Wunused-variable]
ntt.c:   __m256i f0,f1,f2,f3,g0,g1,g2,g3,h0,h1,h2,h3;
ntt.c:                                   ^
ntt.c: ntt.c:562:38: warning: unused variable 'h1' [-Wunused-variable]
ntt.c:   __m256i f0,f1,f2,f3,g0,g1,g2,g3,h0,h1,h2,h3;
ntt.c:                                      ^
ntt.c: ntt.c:562:41: warning: unused variable 'h2' [-Wunused-variable]
ntt.c:   __m256i f0,f1,f2,f3,g0,g1,g2,g3,h0,h1,h2,h3;
ntt.c:                                         ^
ntt.c: ntt.c:562:44: warning: unused variable 'h3' [-Wunused-variable]
ntt.c:   __m256i f0,f1,f2,f3,g0,g1,g2,g3,h0,h1,h2,h3;
ntt.c:                                            ^
ntt.c: ntt.c:864:35: warning: unused variable 'h0' [-Wunused-variable]
ntt.c:   __m256i f0,f1,f2,f3,g0,g1,g2,g3,h0,h1,h2,h3;
ntt.c:                                   ^
ntt.c: ntt.c:864:38: warning: unused variable 'h1' [-Wunused-variable]
ntt.c:   __m256i f0,f1,f2,f3,g0,g1,g2,g3,h0,h1,h2,h3;
ntt.c:                                      ^
ntt.c: ntt.c:864:41: warning: unused variable 'h2' [-Wunused-variable]
ntt.c:   __m256i f0,f1,f2,f3,g0,g1,g2,g3,h0,h1,h2,h3;
ntt.c:                                         ^
ntt.c: ntt.c:864:44: warning: unused variable 'h3' [-Wunused-variable]
ntt.c:   __m256i f0,f1,f2,f3,g0,g1,g2,g3,h0,h1,h2,h3;
ntt.c:                                            ^
ntt.c: ntt.c:865:10: warning: unused variable 'origf' [-Wunused-variable]
ntt.c: ...

Number of similar (implementation,compiler) pairs: 4, namely:
ImplementationCompiler
round2clang -march=native -O2 -fwrapv -Qunused-arguments -fPIC -fPIE -gdwarf-4 -Wall (Debian_Clang_16.0.6_(27+b1))
round2clang -march=native -O3 -fwrapv -Qunused-arguments -fPIC -fPIE -gdwarf-4 -Wall (Debian_Clang_16.0.6_(27+b1))
round2clang -march=native -O -fwrapv -Qunused-arguments -fPIC -fPIE -gdwarf-4 -Wall (Debian_Clang_16.0.6_(27+b1))
round2clang -march=native -Os -fwrapv -Qunused-arguments -fPIC -fPIE -gdwarf-4 -Wall (Debian_Clang_16.0.6_(27+b1))

Compiler output


mult768.c: mult768.c:266:7: error: always_inline function '_mm256_set1_epi16' requires target feature 'avx', but would be inlined into function 'crypto_core_multsntrup653_round2_constbranchindex' that is compiled without support for 'avx'
mult768.c:   x = const_x16(0);
mult768.c:       ^
mult768.c: mult768.c:10:19: note: expanded from macro 'const_x16'
mult768.c: #define const_x16 _mm256_set1_epi16
mult768.c:                   ^
mult768.c: mult768.c:266:7: error: AVX vector return of type '__m256i' (vector of 4 'long long' values) without 'avx' enabled changes the ABI
mult768.c: mult768.c:10:19: note: expanded from macro 'const_x16'
mult768.c: #define const_x16 _mm256_set1_epi16
mult768.c:                   ^
mult768.c: mult768.c:267:35: error: always_inline function '_mm256_storeu_si256' requires target feature 'avx', but would be inlined into function 'crypto_core_multsntrup653_round2_constbranchindex' that is compiled without support for 'avx'
mult768.c:   for (i = p&~15;i < 768;i += 16) store_x16(&f[i],x);
mult768.c:                                   ^
mult768.c: mult768.c:9:24: note: expanded from macro 'store_x16'
mult768.c: #define store_x16(p,v) _mm256_storeu_si256((int16x16 *) (p),(v))
mult768.c:                        ^
mult768.c: mult768.c:267:35: error: AVX vector argument of type '__m256i' (vector of 4 'long long' values) without 'avx' enabled changes the ABI
mult768.c: mult768.c:9:24: note: expanded from macro 'store_x16'
mult768.c: #define store_x16(p,v) _mm256_storeu_si256((int16x16 *) (p),(v))
mult768.c:                        ^
mult768.c: mult768.c:268:35: error: always_inline function '_mm256_storeu_si256' requires target feature 'avx', but would be inlined into function 'crypto_core_multsntrup653_round2_constbranchindex' that is compiled without support for 'avx'
mult768.c:   for (i = p&~15;i < 768;i += 16) store_x16(&g[i],x);
mult768.c:                                   ^
mult768.c: mult768.c:9:24: note: expanded from macro 'store_x16'
mult768.c: #define store_x16(p,v) _mm256_storeu_si256((int16x16 *) (p),(v))
mult768.c: ...

Number of similar (implementation,compiler) pairs: 1, namely:
ImplementationCompiler
round2clang -mcpu=native -O3 -fwrapv -Qunused-arguments -fPIC -fPIE -gdwarf-4 -Wall (Debian_Clang_16.0.6_(27+b1))

Compiler output


ntt.c: ntt.c: In function 'ntt512':
ntt.c: ntt.c:562:44: warning: unused variable 'h3' [-Wunused-variable]
ntt.c:   562 |   __m256i f0,f1,f2,f3,g0,g1,g2,g3,h0,h1,h2,h3;
ntt.c:       |                                            ^~
ntt.c: ntt.c:562:41: warning: unused variable 'h2' [-Wunused-variable]
ntt.c:   562 |   __m256i f0,f1,f2,f3,g0,g1,g2,g3,h0,h1,h2,h3;
ntt.c:       |                                         ^~
ntt.c: ntt.c:562:38: warning: unused variable 'h1' [-Wunused-variable]
ntt.c:   562 |   __m256i f0,f1,f2,f3,g0,g1,g2,g3,h0,h1,h2,h3;
ntt.c:       |                                      ^~
ntt.c: ntt.c:562:35: warning: unused variable 'h0' [-Wunused-variable]
ntt.c:   562 |   __m256i f0,f1,f2,f3,g0,g1,g2,g3,h0,h1,h2,h3;
ntt.c:       |                                   ^~
ntt.c: ntt.c: In function 'invntt512':
ntt.c: ntt.c:865:10: warning: unused variable 'origf' [-Wunused-variable]
ntt.c:   865 |   int16 *origf = f;
ntt.c:       |          ^~~~~
ntt.c: ntt.c:864:44: warning: unused variable 'h3' [-Wunused-variable]
ntt.c:   864 |   __m256i f0,f1,f2,f3,g0,g1,g2,g3,h0,h1,h2,h3;
ntt.c:       |                                            ^~
ntt.c: ntt.c:864:41: warning: unused variable 'h2' [-Wunused-variable]
ntt.c:   864 |   __m256i f0,f1,f2,f3,g0,g1,g2,g3,h0,h1,h2,h3;
ntt.c:       |                                         ^~
ntt.c: ntt.c:864:38: warning: unused variable 'h1' [-Wunused-variable]
ntt.c:   864 |   __m256i f0,f1,f2,f3,g0,g1,g2,g3,h0,h1,h2,h3;
ntt.c: ...

Number of similar (implementation,compiler) pairs: 4, namely:
ImplementationCompiler
round2gcc -march=native -mtune=native -O2 -fwrapv -fPIC -fPIE -gdwarf-4 -Wall (13.3.0)
round2gcc -march=native -mtune=native -O3 -fwrapv -fPIC -fPIE -gdwarf-4 -Wall (13.3.0)
round2gcc -march=native -mtune=native -O -fwrapv -fPIC -fPIE -gdwarf-4 -Wall (13.3.0)
round2gcc -march=native -mtune=native -Os -fwrapv -fPIC -fPIE -gdwarf-4 -Wall (13.3.0)

Passed TIMECOP


TIMECOP iterations: 1

Number of similar (implementation,compiler) pairs: 33, namely:
ImplementationCompiler
avxclang -march=native -O2 -fwrapv -Qunused-arguments -fPIC -fPIE -gdwarf-4 -Wall (Debian_Clang_16.0.6_(27+b1))
avxclang -march=native -O3 -fwrapv -Qunused-arguments -fPIC -fPIE -gdwarf-4 -Wall (Debian_Clang_16.0.6_(27+b1))
avxclang -march=native -O -fwrapv -Qunused-arguments -fPIC -fPIE -gdwarf-4 -Wall (Debian_Clang_16.0.6_(27+b1))
avxclang -march=native -Os -fwrapv -Qunused-arguments -fPIC -fPIE -gdwarf-4 -Wall (Debian_Clang_16.0.6_(27+b1))
avxgcc -march=native -mtune=native -O2 -fwrapv -fPIC -fPIE -gdwarf-4 -Wall (13.3.0)
avxgcc -march=native -mtune=native -O3 -fwrapv -fPIC -fPIE -gdwarf-4 -Wall (13.3.0)
avxgcc -march=native -mtune=native -O -fwrapv -fPIC -fPIE -gdwarf-4 -Wall (13.3.0)
avxgcc -march=native -mtune=native -Os -fwrapv -fPIC -fPIE -gdwarf-4 -Wall (13.3.0)
avx800clang -march=native -O2 -fwrapv -Qunused-arguments -fPIC -fPIE -gdwarf-4 -Wall (Debian_Clang_16.0.6_(27+b1))
avx800clang -march=native -O3 -fwrapv -Qunused-arguments -fPIC -fPIE -gdwarf-4 -Wall (Debian_Clang_16.0.6_(27+b1))
avx800clang -march=native -O -fwrapv -Qunused-arguments -fPIC -fPIE -gdwarf-4 -Wall (Debian_Clang_16.0.6_(27+b1))
avx800clang -march=native -Os -fwrapv -Qunused-arguments -fPIC -fPIE -gdwarf-4 -Wall (Debian_Clang_16.0.6_(27+b1))
avx800gcc -march=native -mtune=native -O2 -fwrapv -fPIC -fPIE -gdwarf-4 -Wall (13.3.0)
avx800gcc -march=native -mtune=native -O3 -fwrapv -fPIC -fPIE -gdwarf-4 -Wall (13.3.0)
avx800gcc -march=native -mtune=native -O -fwrapv -fPIC -fPIE -gdwarf-4 -Wall (13.3.0)
avx800gcc -march=native -mtune=native -Os -fwrapv -fPIC -fPIE -gdwarf-4 -Wall (13.3.0)
refclang -march=native -O2 -fwrapv -Qunused-arguments -fPIC -fPIE -gdwarf-4 -Wall (Debian_Clang_16.0.6_(27+b1))
refclang -march=native -O3 -fwrapv -Qunused-arguments -fPIC -fPIE -gdwarf-4 -Wall (Debian_Clang_16.0.6_(27+b1))
refclang -march=native -O -fwrapv -Qunused-arguments -fPIC -fPIE -gdwarf-4 -Wall (Debian_Clang_16.0.6_(27+b1))
refclang -march=native -Os -fwrapv -Qunused-arguments -fPIC -fPIE -gdwarf-4 -Wall (Debian_Clang_16.0.6_(27+b1))
refclang -mcpu=native -O3 -fwrapv -Qunused-arguments -fPIC -fPIE -gdwarf-4 -Wall (Debian_Clang_16.0.6_(27+b1))
refgcc -march=native -mtune=native -O2 -fwrapv -fPIC -fPIE -gdwarf-4 -Wall (13.3.0)
refgcc -march=native -mtune=native -O3 -fwrapv -fPIC -fPIE -gdwarf-4 -Wall (13.3.0)
refgcc -march=native -mtune=native -O -fwrapv -fPIC -fPIE -gdwarf-4 -Wall (13.3.0)
refgcc -march=native -mtune=native -Os -fwrapv -fPIC -fPIE -gdwarf-4 -Wall (13.3.0)
round2clang -march=native -O2 -fwrapv -Qunused-arguments -fPIC -fPIE -gdwarf-4 -Wall (Debian_Clang_16.0.6_(27+b1))
round2clang -march=native -O3 -fwrapv -Qunused-arguments -fPIC -fPIE -gdwarf-4 -Wall (Debian_Clang_16.0.6_(27+b1))
round2clang -march=native -O -fwrapv -Qunused-arguments -fPIC -fPIE -gdwarf-4 -Wall (Debian_Clang_16.0.6_(27+b1))
round2clang -march=native -Os -fwrapv -Qunused-arguments -fPIC -fPIE -gdwarf-4 -Wall (Debian_Clang_16.0.6_(27+b1))
round2gcc -march=native -mtune=native -O2 -fwrapv -fPIC -fPIE -gdwarf-4 -Wall (13.3.0)
round2gcc -march=native -mtune=native -O3 -fwrapv -fPIC -fPIE -gdwarf-4 -Wall (13.3.0)
round2gcc -march=native -mtune=native -O -fwrapv -fPIC -fPIE -gdwarf-4 -Wall (13.3.0)
round2gcc -march=native -mtune=native -Os -fwrapv -fPIC -fPIE -gdwarf-4 -Wall (13.3.0)