00001
00002
00003
00004
00005 #include "pch.h"
00006
00007 #ifndef CRYPTOPP_GENERATE_X64_MASM
00008
00009 #include "salsa.h"
00010 #include "misc.h"
00011 #include "argnames.h"
00012 #include "cpu.h"
00013
00014 #if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
00015 #include <emmintrin.h>
00016 #endif
00017
00018 NAMESPACE_BEGIN(CryptoPP)
00019
00020 void Salsa20_TestInstantiations()
00021 {
00022 Salsa20::Encryption x;
00023 }
00024
00025 void Salsa20_Policy::CipherSetKey(const NameValuePairs ¶ms, const byte *key, size_t length)
00026 {
00027 m_rounds = params.GetIntValueWithDefault(Name::Rounds(), 20);
00028
00029 if (!(m_rounds == 8 || m_rounds == 12 || m_rounds == 20))
00030 throw InvalidRounds(StaticAlgorithmName(), m_rounds);
00031
00032
00033 GetBlock<word32, LittleEndian, false> get1(key);
00034 get1(m_state[13])(m_state[10])(m_state[7])(m_state[4]);
00035 GetBlock<word32, LittleEndian, false> get2(key + length - 16);
00036 get2(m_state[15])(m_state[12])(m_state[9])(m_state[6]);
00037
00038
00039 m_state[0] = 0x61707865;
00040 m_state[1] = (length == 16) ? 0x3120646e : 0x3320646e;
00041 m_state[2] = (length == 16) ? 0x79622d36 : 0x79622d32;
00042 m_state[3] = 0x6b206574;
00043 }
00044
00045 void Salsa20_Policy::CipherResynchronize(byte *keystreamBuffer, const byte *IV)
00046 {
00047 GetBlock<word32, LittleEndian, false> get(IV);
00048 get(m_state[14])(m_state[11]);
00049 m_state[8] = m_state[5] = 0;
00050 }
00051
00052 void Salsa20_Policy::SeekToIteration(lword iterationCount)
00053 {
00054 m_state[8] = (word32)iterationCount;
00055 m_state[5] = (word32)SafeRightShift<32>(iterationCount);
00056 }
00057
00058 #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X64
00059 unsigned int Salsa20_Policy::GetAlignment() const
00060 {
00061 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
00062 if (HasSSE2())
00063 return 16;
00064 else
00065 #endif
00066 return 1;
00067 }
00068
00069 unsigned int Salsa20_Policy::GetOptimalBlockSize() const
00070 {
00071 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
00072 if (HasSSE2())
00073 return 4*BYTES_PER_ITERATION;
00074 else
00075 #endif
00076 return BYTES_PER_ITERATION;
00077 }
00078 #endif
00079
#ifdef CRYPTOPP_X64_MASM_AVAILABLE
// Keystream routine with C linkage, implemented outside this translation
// unit when the x64 MASM build is available.
extern "C" void Salsa20_OperateKeystream(byte *output, const byte *input, size_t iterationCount, int rounds, void *state);
#endif
00085
00086 #pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code
00087
00088 void Salsa20_Policy::OperateKeystream(KeystreamOperation operation, byte *output, const byte *input, size_t iterationCount)
00089 {
00090 #endif // #ifdef CRYPTOPP_GENERATE_X64_MASM
00091
00092 #ifdef CRYPTOPP_X64_MASM_AVAILABLE
00093 Salsa20_OperateKeystream(output, input, iterationCount, m_rounds, m_state.data());
00094 return;
00095 #endif
00096
00097 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
00098 #ifdef CRYPTOPP_GENERATE_X64_MASM
00099 ALIGN 8
00100 Salsa20_OperateKeystream PROC FRAME
00101 mov r10, [rsp + 5*8] ; state
00102 alloc_stack(10*16 + 32*16 + 8)
00103 save_xmm128 xmm6, 0200h
00104 save_xmm128 xmm7, 0210h
00105 save_xmm128 xmm8, 0220h
00106 save_xmm128 xmm9, 0230h
00107 save_xmm128 xmm10, 0240h
00108 save_xmm128 xmm11, 0250h
00109 save_xmm128 xmm12, 0260h
00110 save_xmm128 xmm13, 0270h
00111 save_xmm128 xmm14, 0280h
00112 save_xmm128 xmm15, 0290h
00113 .endprolog
00114
00115 #define REG_output rcx
00116 #define REG_input rdx
00117 #define REG_iterationCount r8
00118 #define REG_state r10
00119 #define REG_rounds e9d
00120 #define REG_roundsLeft eax
00121 #define REG_temp32 r11d
00122 #define REG_temp r11
00123 #define SSE2_WORKSPACE rsp
00124 #else
00125 if (HasSSE2())
00126 {
00127 #if CRYPTOPP_BOOL_X64
00128 #define REG_output %4
00129 #define REG_input %1
00130 #define REG_iterationCount %2
00131 #define REG_state %3
00132 #define REG_rounds %0
00133 #define REG_roundsLeft eax
00134 #define REG_temp32 edx
00135 #define REG_temp rdx
00136 #define SSE2_WORKSPACE %5
00137
00138 __m128i workspace[32];
00139 #else
00140 #define REG_output edi
00141 #define REG_input eax
00142 #define REG_iterationCount ecx
00143 #define REG_state esi
00144 #define REG_rounds edx
00145 #define REG_roundsLeft ebx
00146 #define REG_temp32 ebp
00147 #define REG_temp ebp
00148 #define SSE2_WORKSPACE esp + WORD_SZ
00149 #endif
00150
00151 #ifdef __GNUC__
00152 __asm__ __volatile__
00153 (
00154 ".intel_syntax noprefix;"
00155 AS_PUSH_IF86( bx)
00156 #else
00157 void *s = m_state.data();
00158 word32 r = m_rounds;
00159
00160 AS2( mov REG_iterationCount, iterationCount)
00161 AS2( mov REG_input, input)
00162 AS2( mov REG_output, output)
00163 AS2( mov REG_state, s)
00164 AS2( mov REG_rounds, r)
00165 #endif
00166 #endif // #ifndef CRYPTOPP_GENERATE_X64_MASM
00167
00168 AS_PUSH_IF86( bp)
00169 AS2( cmp REG_iterationCount, 4)
00170 ASJ( jl, 5, f)
00171
00172 #if CRYPTOPP_BOOL_X86
00173 AS2( mov ebx, esp)
00174 AS2( and esp, -16)
00175 AS2( sub esp, 32*16)
00176 AS1( push ebx)
00177 #endif
00178
00179 #define SSE2_EXPAND_S(i, j) \
00180 ASS( pshufd xmm4, xmm##i, j, j, j, j) \
00181 AS2( movdqa [SSE2_WORKSPACE + (i*4+j)*16 + 256], xmm4)
00182
00183 AS2( movdqa xmm0, [REG_state + 0*16])
00184 AS2( movdqa xmm1, [REG_state + 1*16])
00185 AS2( movdqa xmm2, [REG_state + 2*16])
00186 AS2( movdqa xmm3, [REG_state + 3*16])
00187 SSE2_EXPAND_S(0, 0)
00188 SSE2_EXPAND_S(0, 1)
00189 SSE2_EXPAND_S(0, 2)
00190 SSE2_EXPAND_S(0, 3)
00191 SSE2_EXPAND_S(1, 0)
00192 SSE2_EXPAND_S(1, 2)
00193 SSE2_EXPAND_S(1, 3)
00194 SSE2_EXPAND_S(2, 1)
00195 SSE2_EXPAND_S(2, 2)
00196 SSE2_EXPAND_S(2, 3)
00197 SSE2_EXPAND_S(3, 0)
00198 SSE2_EXPAND_S(3, 1)
00199 SSE2_EXPAND_S(3, 2)
00200 SSE2_EXPAND_S(3, 3)
00201
00202 #define SSE2_EXPAND_S85(i) \
00203 AS2( mov dword ptr [SSE2_WORKSPACE + 8*16 + i*4 + 256], REG_roundsLeft) \
00204 AS2( mov dword ptr [SSE2_WORKSPACE + 5*16 + i*4 + 256], REG_temp32) \
00205 AS2( add REG_roundsLeft, 1) \
00206 AS2( adc REG_temp32, 0)
00207
00208 ASL(1)
00209 AS2( mov REG_roundsLeft, dword ptr [REG_state + 8*4])
00210 AS2( mov REG_temp32, dword ptr [REG_state + 5*4])
00211 SSE2_EXPAND_S85(0)
00212 SSE2_EXPAND_S85(1)
00213 SSE2_EXPAND_S85(2)
00214 SSE2_EXPAND_S85(3)
00215 AS2( mov dword ptr [REG_state + 8*4], REG_roundsLeft)
00216 AS2( mov dword ptr [REG_state + 5*4], REG_temp32)
00217
00218 #define SSE2_QUARTER_ROUND(a, b, d, i) \
00219 AS2( movdqa xmm4, xmm##d) \
00220 AS2( paddd xmm4, xmm##a) \
00221 AS2( movdqa xmm5, xmm4) \
00222 AS2( pslld xmm4, i) \
00223 AS2( psrld xmm5, 32-i) \
00224 AS2( pxor xmm##b, xmm4) \
00225 AS2( pxor xmm##b, xmm5)
00226
00227 #define L01(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##A, [SSE2_WORKSPACE + d*16 + i*256])
00228 #define L02(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##C, [SSE2_WORKSPACE + a*16 + i*256])
00229 #define L03(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##C)
00230 #define L04(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##B, xmm##A)
00231 #define L05(A,B,C,D,a,b,c,d,i) AS2( pslld xmm##A, 7)
00232 #define L06(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##B, 32-7)
00233 #define L07(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, [SSE2_WORKSPACE + b*16 + i*256])
00234 #define L08(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##B)
00235 #define L09(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + b*16], xmm##A)
00236 #define L10(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##B, xmm##A)
00237 #define L11(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##C)
00238 #define L12(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##D, xmm##A)
00239 #define L13(A,B,C,D,a,b,c,d,i) AS2( pslld xmm##A, 9)
00240 #define L14(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##D, 32-9)
00241 #define L15(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, [SSE2_WORKSPACE + c*16 + i*256])
00242 #define L16(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##D)
00243 #define L17(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + c*16], xmm##A)
00244 #define L18(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##D, xmm##A)
00245 #define L19(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##B)
00246 #define L20(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##B, xmm##A)
00247 #define L21(A,B,C,D,a,b,c,d,i) AS2( pslld xmm##A, 13)
00248 #define L22(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##B, 32-13)
00249 #define L23(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, [SSE2_WORKSPACE + d*16 + i*256])
00250 #define L24(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##B)
00251 #define L25(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + d*16], xmm##A)
00252 #define L26(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##D)
00253 #define L27(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##D, xmm##A)
00254 #define L28(A,B,C,D,a,b,c,d,i) AS2( pslld xmm##A, 18)
00255 #define L29(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##D, 32-18)
00256 #define L30(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##C)
00257 #define L31(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##D)
00258 #define L32(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + a*16], xmm##A)
00259
00260 #define SSE2_QUARTER_ROUND_X8(i, a, b, c, d, e, f, g, h) \
00261 L01(0,1,2,3, a,b,c,d, i) L01(4,5,6,7, e,f,g,h, i) \
00262 L02(0,1,2,3, a,b,c,d, i) L02(4,5,6,7, e,f,g,h, i) \
00263 L03(0,1,2,3, a,b,c,d, i) L03(4,5,6,7, e,f,g,h, i) \
00264 L04(0,1,2,3, a,b,c,d, i) L04(4,5,6,7, e,f,g,h, i) \
00265 L05(0,1,2,3, a,b,c,d, i) L05(4,5,6,7, e,f,g,h, i) \
00266 L06(0,1,2,3, a,b,c,d, i) L06(4,5,6,7, e,f,g,h, i) \
00267 L07(0,1,2,3, a,b,c,d, i) L07(4,5,6,7, e,f,g,h, i) \
00268 L08(0,1,2,3, a,b,c,d, i) L08(4,5,6,7, e,f,g,h, i) \
00269 L09(0,1,2,3, a,b,c,d, i) L09(4,5,6,7, e,f,g,h, i) \
00270 L10(0,1,2,3, a,b,c,d, i) L10(4,5,6,7, e,f,g,h, i) \
00271 L11(0,1,2,3, a,b,c,d, i) L11(4,5,6,7, e,f,g,h, i) \
00272 L12(0,1,2,3, a,b,c,d, i) L12(4,5,6,7, e,f,g,h, i) \
00273 L13(0,1,2,3, a,b,c,d, i) L13(4,5,6,7, e,f,g,h, i) \
00274 L14(0,1,2,3, a,b,c,d, i) L14(4,5,6,7, e,f,g,h, i) \
00275 L15(0,1,2,3, a,b,c,d, i) L15(4,5,6,7, e,f,g,h, i) \
00276 L16(0,1,2,3, a,b,c,d, i) L16(4,5,6,7, e,f,g,h, i) \
00277 L17(0,1,2,3, a,b,c,d, i) L17(4,5,6,7, e,f,g,h, i) \
00278 L18(0,1,2,3, a,b,c,d, i) L18(4,5,6,7, e,f,g,h, i) \
00279 L19(0,1,2,3, a,b,c,d, i) L19(4,5,6,7, e,f,g,h, i) \
00280 L20(0,1,2,3, a,b,c,d, i) L20(4,5,6,7, e,f,g,h, i) \
00281 L21(0,1,2,3, a,b,c,d, i) L21(4,5,6,7, e,f,g,h, i) \
00282 L22(0,1,2,3, a,b,c,d, i) L22(4,5,6,7, e,f,g,h, i) \
00283 L23(0,1,2,3, a,b,c,d, i) L23(4,5,6,7, e,f,g,h, i) \
00284 L24(0,1,2,3, a,b,c,d, i) L24(4,5,6,7, e,f,g,h, i) \
00285 L25(0,1,2,3, a,b,c,d, i) L25(4,5,6,7, e,f,g,h, i) \
00286 L26(0,1,2,3, a,b,c,d, i) L26(4,5,6,7, e,f,g,h, i) \
00287 L27(0,1,2,3, a,b,c,d, i) L27(4,5,6,7, e,f,g,h, i) \
00288 L28(0,1,2,3, a,b,c,d, i) L28(4,5,6,7, e,f,g,h, i) \
00289 L29(0,1,2,3, a,b,c,d, i) L29(4,5,6,7, e,f,g,h, i) \
00290 L30(0,1,2,3, a,b,c,d, i) L30(4,5,6,7, e,f,g,h, i) \
00291 L31(0,1,2,3, a,b,c,d, i) L31(4,5,6,7, e,f,g,h, i) \
00292 L32(0,1,2,3, a,b,c,d, i) L32(4,5,6,7, e,f,g,h, i)
00293
00294 #define SSE2_QUARTER_ROUND_X16(i, a, b, c, d, e, f, g, h, A, B, C, D, E, F, G, H) \
00295 L01(0,1,2,3, a,b,c,d, i) L01(4,5,6,7, e,f,g,h, i) L01(8,9,10,11, A,B,C,D, i) L01(12,13,14,15, E,F,G,H, i) \
00296 L02(0,1,2,3, a,b,c,d, i) L02(4,5,6,7, e,f,g,h, i) L02(8,9,10,11, A,B,C,D, i) L02(12,13,14,15, E,F,G,H, i) \
00297 L03(0,1,2,3, a,b,c,d, i) L03(4,5,6,7, e,f,g,h, i) L03(8,9,10,11, A,B,C,D, i) L03(12,13,14,15, E,F,G,H, i) \
00298 L04(0,1,2,3, a,b,c,d, i) L04(4,5,6,7, e,f,g,h, i) L04(8,9,10,11, A,B,C,D, i) L04(12,13,14,15, E,F,G,H, i) \
00299 L05(0,1,2,3, a,b,c,d, i) L05(4,5,6,7, e,f,g,h, i) L05(8,9,10,11, A,B,C,D, i) L05(12,13,14,15, E,F,G,H, i) \
00300 L06(0,1,2,3, a,b,c,d, i) L06(4,5,6,7, e,f,g,h, i) L06(8,9,10,11, A,B,C,D, i) L06(12,13,14,15, E,F,G,H, i) \
00301 L07(0,1,2,3, a,b,c,d, i) L07(4,5,6,7, e,f,g,h, i) L07(8,9,10,11, A,B,C,D, i) L07(12,13,14,15, E,F,G,H, i) \
00302 L08(0,1,2,3, a,b,c,d, i) L08(4,5,6,7, e,f,g,h, i) L08(8,9,10,11, A,B,C,D, i) L08(12,13,14,15, E,F,G,H, i) \
00303 L09(0,1,2,3, a,b,c,d, i) L09(4,5,6,7, e,f,g,h, i) L09(8,9,10,11, A,B,C,D, i) L09(12,13,14,15, E,F,G,H, i) \
00304 L10(0,1,2,3, a,b,c,d, i) L10(4,5,6,7, e,f,g,h, i) L10(8,9,10,11, A,B,C,D, i) L10(12,13,14,15, E,F,G,H, i) \
00305 L11(0,1,2,3, a,b,c,d, i) L11(4,5,6,7, e,f,g,h, i) L11(8,9,10,11, A,B,C,D, i) L11(12,13,14,15, E,F,G,H, i) \
00306 L12(0,1,2,3, a,b,c,d, i) L12(4,5,6,7, e,f,g,h, i) L12(8,9,10,11, A,B,C,D, i) L12(12,13,14,15, E,F,G,H, i) \
00307 L13(0,1,2,3, a,b,c,d, i) L13(4,5,6,7, e,f,g,h, i) L13(8,9,10,11, A,B,C,D, i) L13(12,13,14,15, E,F,G,H, i) \
00308 L14(0,1,2,3, a,b,c,d, i) L14(4,5,6,7, e,f,g,h, i) L14(8,9,10,11, A,B,C,D, i) L14(12,13,14,15, E,F,G,H, i) \
00309 L15(0,1,2,3, a,b,c,d, i) L15(4,5,6,7, e,f,g,h, i) L15(8,9,10,11, A,B,C,D, i) L15(12,13,14,15, E,F,G,H, i) \
00310 L16(0,1,2,3, a,b,c,d, i) L16(4,5,6,7, e,f,g,h, i) L16(8,9,10,11, A,B,C,D, i) L16(12,13,14,15, E,F,G,H, i) \
00311 L17(0,1,2,3, a,b,c,d, i) L17(4,5,6,7, e,f,g,h, i) L17(8,9,10,11, A,B,C,D, i) L17(12,13,14,15, E,F,G,H, i) \
00312 L18(0,1,2,3, a,b,c,d, i) L18(4,5,6,7, e,f,g,h, i) L18(8,9,10,11, A,B,C,D, i) L18(12,13,14,15, E,F,G,H, i) \
00313 L19(0,1,2,3, a,b,c,d, i) L19(4,5,6,7, e,f,g,h, i) L19(8,9,10,11, A,B,C,D, i) L19(12,13,14,15, E,F,G,H, i) \
00314 L20(0,1,2,3, a,b,c,d, i) L20(4,5,6,7, e,f,g,h, i) L20(8,9,10,11, A,B,C,D, i) L20(12,13,14,15, E,F,G,H, i) \
00315 L21(0,1,2,3, a,b,c,d, i) L21(4,5,6,7, e,f,g,h, i) L21(8,9,10,11, A,B,C,D, i) L21(12,13,14,15, E,F,G,H, i) \
00316 L22(0,1,2,3, a,b,c,d, i) L22(4,5,6,7, e,f,g,h, i) L22(8,9,10,11, A,B,C,D, i) L22(12,13,14,15, E,F,G,H, i) \
00317 L23(0,1,2,3, a,b,c,d, i) L23(4,5,6,7, e,f,g,h, i) L23(8,9,10,11, A,B,C,D, i) L23(12,13,14,15, E,F,G,H, i) \
00318 L24(0,1,2,3, a,b,c,d, i) L24(4,5,6,7, e,f,g,h, i) L24(8,9,10,11, A,B,C,D, i) L24(12,13,14,15, E,F,G,H, i) \
00319 L25(0,1,2,3, a,b,c,d, i) L25(4,5,6,7, e,f,g,h, i) L25(8,9,10,11, A,B,C,D, i) L25(12,13,14,15, E,F,G,H, i) \
00320 L26(0,1,2,3, a,b,c,d, i) L26(4,5,6,7, e,f,g,h, i) L26(8,9,10,11, A,B,C,D, i) L26(12,13,14,15, E,F,G,H, i) \
00321 L27(0,1,2,3, a,b,c,d, i) L27(4,5,6,7, e,f,g,h, i) L27(8,9,10,11, A,B,C,D, i) L27(12,13,14,15, E,F,G,H, i) \
00322 L28(0,1,2,3, a,b,c,d, i) L28(4,5,6,7, e,f,g,h, i) L28(8,9,10,11, A,B,C,D, i) L28(12,13,14,15, E,F,G,H, i) \
00323 L29(0,1,2,3, a,b,c,d, i) L29(4,5,6,7, e,f,g,h, i) L29(8,9,10,11, A,B,C,D, i) L29(12,13,14,15, E,F,G,H, i) \
00324 L30(0,1,2,3, a,b,c,d, i) L30(4,5,6,7, e,f,g,h, i) L30(8,9,10,11, A,B,C,D, i) L30(12,13,14,15, E,F,G,H, i) \
00325 L31(0,1,2,3, a,b,c,d, i) L31(4,5,6,7, e,f,g,h, i) L31(8,9,10,11, A,B,C,D, i) L31(12,13,14,15, E,F,G,H, i) \
00326 L32(0,1,2,3, a,b,c,d, i) L32(4,5,6,7, e,f,g,h, i) L32(8,9,10,11, A,B,C,D, i) L32(12,13,14,15, E,F,G,H, i)
00327
00328 #if CRYPTOPP_BOOL_X64
00329 SSE2_QUARTER_ROUND_X16(1, 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15)
00330 #else
00331 SSE2_QUARTER_ROUND_X8(1, 2, 6, 10, 14, 3, 7, 11, 15)
00332 SSE2_QUARTER_ROUND_X8(1, 0, 4, 8, 12, 1, 5, 9, 13)
00333 #endif
00334 AS2( mov REG_roundsLeft, REG_rounds)
00335 ASJ( jmp, 2, f)
00336
00337 ASL(SSE2_Salsa_Output)
00338 AS2( movdqa xmm0, xmm4)
00339 AS2( punpckldq xmm4, xmm5)
00340 AS2( movdqa xmm1, xmm6)
00341 AS2( punpckldq xmm6, xmm7)
00342 AS2( movdqa xmm2, xmm4)
00343 AS2( punpcklqdq xmm4, xmm6)
00344 AS2( punpckhqdq xmm2, xmm6)
00345 AS2( punpckhdq xmm0, xmm5)
00346 AS2( punpckhdq xmm1, xmm7)
00347 AS2( movdqa xmm6, xmm0)
00348 AS2( punpcklqdq xmm0, xmm1)
00349 AS2( punpckhqdq xmm6, xmm1)
00350 AS_XMM_OUTPUT4(SSE2_Salsa_Output_A, REG_input, REG_output, 4, 2, 0, 6, 1, 0, 4, 8, 12, 1)
00351 AS1( ret)
00352
00353 ASL(6)
00354 #if CRYPTOPP_BOOL_X64
00355 SSE2_QUARTER_ROUND_X16(0, 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15)
00356 ASL(2)
00357 SSE2_QUARTER_ROUND_X16(0, 0, 13, 10, 7, 1, 14, 11, 4, 2, 15, 8, 5, 3, 12, 9, 6)
00358 #else
00359 SSE2_QUARTER_ROUND_X8(0, 2, 6, 10, 14, 3, 7, 11, 15)
00360 SSE2_QUARTER_ROUND_X8(0, 0, 4, 8, 12, 1, 5, 9, 13)
00361 ASL(2)
00362 SSE2_QUARTER_ROUND_X8(0, 2, 15, 8, 5, 3, 12, 9, 6)
00363 SSE2_QUARTER_ROUND_X8(0, 0, 13, 10, 7, 1, 14, 11, 4)
00364 #endif
00365 AS2( sub REG_roundsLeft, 2)
00366 ASJ( jnz, 6, b)
00367
00368 #define SSE2_OUTPUT_4(a, b, c, d) \
00369 AS2( movdqa xmm4, [SSE2_WORKSPACE + a*16 + 256])\
00370 AS2( paddd xmm4, [SSE2_WORKSPACE + a*16])\
00371 AS2( movdqa xmm5, [SSE2_WORKSPACE + b*16 + 256])\
00372 AS2( paddd xmm5, [SSE2_WORKSPACE + b*16])\
00373 AS2( movdqa xmm6, [SSE2_WORKSPACE + c*16 + 256])\
00374 AS2( paddd xmm6, [SSE2_WORKSPACE + c*16])\
00375 AS2( movdqa xmm7, [SSE2_WORKSPACE + d*16 + 256])\
00376 AS2( paddd xmm7, [SSE2_WORKSPACE + d*16])\
00377 ASC( call, SSE2_Salsa_Output)
00378
00379 SSE2_OUTPUT_4(0, 13, 10, 7)
00380 SSE2_OUTPUT_4(4, 1, 14, 11)
00381 SSE2_OUTPUT_4(8, 5, 2, 15)
00382 SSE2_OUTPUT_4(12, 9, 6, 3)
00383 AS2( test REG_input, REG_input)
00384 ASJ( jz, 9, f)
00385 AS2( add REG_input, 12*16)
00386 ASL(9)
00387 AS2( add REG_output, 12*16)
00388 AS2( sub REG_iterationCount, 4)
00389 AS2( cmp REG_iterationCount, 4)
00390 ASJ( jge, 1, b)
00391 AS_POP_IF86( sp)
00392
00393 ASL(5)
00394 AS2( sub REG_iterationCount, 1)
00395 ASJ( jl, 4, f)
00396 AS2( movdqa xmm0, [REG_state + 0*16])
00397 AS2( movdqa xmm1, [REG_state + 1*16])
00398 AS2( movdqa xmm2, [REG_state + 2*16])
00399 AS2( movdqa xmm3, [REG_state + 3*16])
00400 AS2( mov REG_roundsLeft, REG_rounds)
00401
00402 ASL(0)
00403 SSE2_QUARTER_ROUND(0, 1, 3, 7)
00404 SSE2_QUARTER_ROUND(1, 2, 0, 9)
00405 SSE2_QUARTER_ROUND(2, 3, 1, 13)
00406 SSE2_QUARTER_ROUND(3, 0, 2, 18)
00407 ASS( pshufd xmm1, xmm1, 2, 1, 0, 3)
00408 ASS( pshufd xmm2, xmm2, 1, 0, 3, 2)
00409 ASS( pshufd xmm3, xmm3, 0, 3, 2, 1)
00410 SSE2_QUARTER_ROUND(0, 3, 1, 7)
00411 SSE2_QUARTER_ROUND(3, 2, 0, 9)
00412 SSE2_QUARTER_ROUND(2, 1, 3, 13)
00413 SSE2_QUARTER_ROUND(1, 0, 2, 18)
00414 ASS( pshufd xmm1, xmm1, 0, 3, 2, 1)
00415 ASS( pshufd xmm2, xmm2, 1, 0, 3, 2)
00416 ASS( pshufd xmm3, xmm3, 2, 1, 0, 3)
00417 AS2( sub REG_roundsLeft, 2)
00418 ASJ( jnz, 0, b)
00419
00420 AS2( paddd xmm0, [REG_state + 0*16])
00421 AS2( paddd xmm1, [REG_state + 1*16])
00422 AS2( paddd xmm2, [REG_state + 2*16])
00423 AS2( paddd xmm3, [REG_state + 3*16])
00424
00425 AS2( add dword ptr [REG_state + 8*4], 1)
00426 AS2( adc dword ptr [REG_state + 5*4], 0)
00427
00428 AS2( pcmpeqb xmm6, xmm6)
00429 AS2( psrlq xmm6, 32)
00430 ASS( pshufd xmm7, xmm6, 0, 1, 2, 3)
00431 AS2( movdqa xmm4, xmm0)
00432 AS2( movdqa xmm5, xmm3)
00433 AS2( pand xmm0, xmm7)
00434 AS2( pand xmm4, xmm6)
00435 AS2( pand xmm3, xmm6)
00436 AS2( pand xmm5, xmm7)
00437 AS2( por xmm4, xmm5)
00438 AS2( movdqa xmm5, xmm1)
00439 AS2( pand xmm1, xmm7)
00440 AS2( pand xmm5, xmm6)
00441 AS2( por xmm0, xmm5)
00442 AS2( pand xmm6, xmm2)
00443 AS2( pand xmm2, xmm7)
00444 AS2( por xmm1, xmm6)
00445 AS2( por xmm2, xmm3)
00446
00447 AS2( movdqa xmm5, xmm4)
00448 AS2( movdqa xmm6, xmm0)
00449 AS3( shufpd xmm4, xmm1, 2)
00450 AS3( shufpd xmm0, xmm2, 2)
00451 AS3( shufpd xmm1, xmm5, 2)
00452 AS3( shufpd xmm2, xmm6, 2)
00453
00454
00455 AS_XMM_OUTPUT4(SSE2_Salsa_Output_B, REG_input, REG_output, 4, 0, 1, 2, 3, 0, 1, 2, 3, 4)
00456 ASJ( jmp, 5, b)
00457 ASL(4)
00458
00459 AS_POP_IF86( bp)
00460 #ifdef __GNUC__
00461 AS_POP_IF86( bx)
00462 ".att_syntax prefix;"
00463 :
00464 #if CRYPTOPP_BOOL_X64
00465 : "r" (m_rounds), "r" (input), "r" (iterationCount), "r" (m_state.data()), "r" (output), "r" (workspace)
00466 : "%eax", "%edx", "memory", "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15"
00467 #else
00468 : "d" (m_rounds), "a" (input), "c" (iterationCount), "S" (m_state.data()), "D" (output)
00469 : "memory", "cc"
00470 #endif
00471 );
00472 #endif
00473 #ifdef CRYPTOPP_GENERATE_X64_MASM
00474 movdqa xmm6, [rsp + 0200h]
00475 movdqa xmm7, [rsp + 0210h]
00476 movdqa xmm8, [rsp + 0220h]
00477 movdqa xmm9, [rsp + 0230h]
00478 movdqa xmm10, [rsp + 0240h]
00479 movdqa xmm11, [rsp + 0250h]
00480 movdqa xmm12, [rsp + 0260h]
00481 movdqa xmm13, [rsp + 0270h]
00482 movdqa xmm14, [rsp + 0280h]
00483 movdqa xmm15, [rsp + 0290h]
00484 add rsp, 10*16 + 32*16 + 8
00485 ret
00486 Salsa20_OperateKeystream ENDP
00487 #else
00488 }
00489 else
00490 #endif
00491 #endif
00492 #ifndef CRYPTOPP_GENERATE_X64_MASM
00493 {
00494 word32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
00495
00496 while (iterationCount--)
00497 {
00498 x0 = m_state[0];
00499 x1 = m_state[1];
00500 x2 = m_state[2];
00501 x3 = m_state[3];
00502 x4 = m_state[4];
00503 x5 = m_state[5];
00504 x6 = m_state[6];
00505 x7 = m_state[7];
00506 x8 = m_state[8];
00507 x9 = m_state[9];
00508 x10 = m_state[10];
00509 x11 = m_state[11];
00510 x12 = m_state[12];
00511 x13 = m_state[13];
00512 x14 = m_state[14];
00513 x15 = m_state[15];
00514
00515 for (int i=m_rounds; i>0; i-=2)
00516 {
00517 #define QUARTER_ROUND(a, b, c, d) \
00518 b = b ^ rotlFixed(a + d, 7); \
00519 c = c ^ rotlFixed(b + a, 9); \
00520 d = d ^ rotlFixed(c + b, 13); \
00521 a = a ^ rotlFixed(d + c, 18);
00522
00523 QUARTER_ROUND(x0, x4, x8, x12)
00524 QUARTER_ROUND(x1, x5, x9, x13)
00525 QUARTER_ROUND(x2, x6, x10, x14)
00526 QUARTER_ROUND(x3, x7, x11, x15)
00527
00528 QUARTER_ROUND(x0, x13, x10, x7)
00529 QUARTER_ROUND(x1, x14, x11, x4)
00530 QUARTER_ROUND(x2, x15, x8, x5)
00531 QUARTER_ROUND(x3, x12, x9, x6)
00532 }
00533
00534 #define SALSA_OUTPUT(x) {\
00535 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 0, x0 + m_state[0]);\
00536 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 1, x13 + m_state[13]);\
00537 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 2, x10 + m_state[10]);\
00538 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 3, x7 + m_state[7]);\
00539 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 4, x4 + m_state[4]);\
00540 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 5, x1 + m_state[1]);\
00541 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 6, x14 + m_state[14]);\
00542 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 7, x11 + m_state[11]);\
00543 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 8, x8 + m_state[8]);\
00544 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 9, x5 + m_state[5]);\
00545 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 10, x2 + m_state[2]);\
00546 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 11, x15 + m_state[15]);\
00547 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 12, x12 + m_state[12]);\
00548 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 13, x9 + m_state[9]);\
00549 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 14, x6 + m_state[6]);\
00550 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 15, x3 + m_state[3]);}
00551
00552 #ifndef CRYPTOPP_DOXYGEN_PROCESSING
00553 CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(SALSA_OUTPUT, BYTES_PER_ITERATION);
00554 #endif
00555
00556 if (++m_state[8] == 0)
00557 ++m_state[5];
00558 }
00559 }
00560 }
00561
00562 NAMESPACE_END
00563
00564 #endif // #ifndef CRYPTOPP_GENERATE_X64_MASM