00001
00002
00003
00004
00005 #ifndef VEC_SSE_H
00006 #define VEC_SSE_H
00007
00008 #include "BaseException.h"
00009
00010 namespace FD {
00011
00012 #ifdef _ENABLE_SSE
00013
00014 #define CLOBBER_SSE : "memory"
00015
00016
00017
// Computes c[i] += a * b[i] for i = 0 .. len-1 using SSE.
// 32-bit x86 GNU inline assembly; all loads/stores are unaligned
// (movups/movss), so b and c need no particular alignment.  Handles any
// len: an 8-float unrolled loop, one optional 4-float step, then a
// scalar tail.
// NOTE(review): %1, %2 and %3 are declared as *input* operands yet are
// modified inside the asm; the push/pop pairs restore them so the
// compiler's view stays consistent.  This predates "+r" read-write
// constraints — it works in practice but is fragile.
// NOTE(review): xmm0-xmm4 are overwritten but only "memory" is declared
// (via CLOBBER_SSE); confirm no caller keeps live xmm state across this.
inline void vec_mul_and_add_sse(const float a, const float *b, float *c, int len)
{
__asm__ __volatile__ (

// Preserve the registers holding the operands we modify below.
"\tpush %0\n"
"\tpush %1\n"
"\tpush %2\n"
"\tpush %3\n"

// Load the scalar a and broadcast it to all four lanes of xmm0.
"\tmovss (%0), %%xmm0\n"
"\tshufps $0, %%xmm0, %%xmm0\n"

// len -= 8; if fewer than 8 elements remain, skip the unrolled loop.
"\tsub $8, %2\n"
"\tjb mul8_skip%=\n"

// Main loop: two 4-float vectors (8 floats) per iteration.
"mul8_loop%=:\n"
"\tmovups (%1), %%xmm1\n"
"\tmovups 16(%1), %%xmm2\n"
"\tmulps %%xmm0, %%xmm1\n"
"\tmulps %%xmm0, %%xmm2\n"

// c[i..i+7] += a * b[i..i+7]
"\tmovups (%3), %%xmm3\n"
"\tmovups 16(%3), %%xmm4\n"
"\taddps %%xmm1, %%xmm3\n"
"\taddps %%xmm2, %%xmm4\n"
"\tmovups %%xmm3, (%3)\n"
"\tmovups %%xmm4, 16(%3)\n"

// Advance both pointers by 8 floats (32 bytes) and loop while len >= 0.
"\tadd $32, %1\n"
"\tadd $32, %3\n"
"\tsub $8, %2\n"

"\tjae mul8_loop%=\n"

"mul8_skip%=:\n"

// Here len is in [-8, -1]; add 4 back and, if >= 0, do one 4-float step.
"\tadd $4, %2\n"
"\tjl mul4_skip%=\n"

"\tmovups (%1), %%xmm1\n"
"\tmulps %%xmm0, %%xmm1\n"
"\tmovups (%3), %%xmm3\n"
"\taddps %%xmm1, %%xmm3\n"
"\tmovups %%xmm3, (%3)\n"

"\tadd $16, %1\n"
"\tadd $16, %3\n"

"\tsub $4, %2\n"

"mul4_skip%=:\n"

// Scalar tail: restore the remaining count (0..3) and handle one float
// per iteration.
"\tadd $4, %2\n"

"\tjmp cond1%=\n"

"mul1_loop%=:\n"

"\tmovss (%1), %%xmm1\n"
"\tmulss %%xmm0, %%xmm1\n"
"\tmovss (%3), %%xmm3\n"
"\taddss %%xmm1, %%xmm3\n"
"\tmovss %%xmm3, (%3)\n"
"\tadd $4, %1\n"
"\tadd $4, %3\n"

"cond1%=:\n"
"\tsub $1, %2\n"
"\tjae mul1_loop%=\n"

// Restore the saved operand registers.
"\tpop %3\n"
"\tpop %2\n"
"\tpop %1\n"
"\tpop %0\n"

: : "r" (&a), "r" (b), "q" (len), "r" (c)
CLOBBER_SSE
);
}
00098
// Returns the inner (dot) product of a and b over len floats, using SSE.
// 32-bit x86 only: the asm hard-codes eax (a), edi (b), ecx (len) and
// edx (&sum).  Unaligned loads, so no alignment requirement.  Any len is
// handled: 8-float loop, optional 4-float step, scalar tail.
// NOTE(review): the additions are reassociated (two vector accumulators,
// then a horizontal reduction), so the result can differ in the last
// bits from a sequential scalar sum.
// NOTE(review): "emms" resets MMX state, but no MMX registers are used
// here — it appears redundant; confirm before removing.
inline float vec_inner_prod_sse(const float *a, const float *b, int len)
{
float sum;
__asm__ __volatile__ (

// Preserve the registers the asm modifies in place.
"\tpush %%eax\n"
"\tpush %%edi\n"
"\tpush %%ecx\n"
// Zero the two vector accumulators.
"\txorps %%xmm3, %%xmm3\n"
"\txorps %%xmm4, %%xmm4\n"

// len -= 8; skip the unrolled loop if fewer than 8 elements.
"\tsub $8, %%ecx\n"
"\tjb mul8_skip%=\n"

// Main loop: accumulate 8 products per iteration into xmm3/xmm4.
"mul8_loop%=:\n"
"\tmovups (%%eax), %%xmm0\n"
"\tmovups (%%edi), %%xmm1\n"
"\tmovups 16(%%eax), %%xmm5\n"
"\tmovups 16(%%edi), %%xmm6\n"
"\tadd $32, %%eax\n"
"\tadd $32, %%edi\n"
"\tmulps %%xmm0, %%xmm1\n"
"\tmulps %%xmm5, %%xmm6\n"
"\taddps %%xmm1, %%xmm3\n"
"\taddps %%xmm6, %%xmm4\n"

"\tsub $8, %%ecx\n"

"\tjae mul8_loop%=\n"

"mul8_skip%=:\n"

// Fold the second accumulator into the first.
"\taddps %%xmm4, %%xmm3\n"

// Here len is in [-8, -1]; add 4 back and, if >= 0, do one 4-float step.
"\tadd $4, %%ecx\n"
"\tjl mul4_skip%=\n"

"\tmovups (%%eax), %%xmm0\n"
"\tmovups (%%edi), %%xmm1\n"
"\tadd $16, %%eax\n"
"\tadd $16, %%edi\n"
"\tmulps %%xmm0, %%xmm1\n"
"\taddps %%xmm1, %%xmm3\n"

"\tsub $4, %%ecx\n"

"mul4_skip%=:\n"

// Scalar tail: restore the remaining count (0..3).
"\tadd $4, %%ecx\n"

"\tjmp cond1%=\n"

"mul1_loop%=:\n"
"\tmovss (%%eax), %%xmm0\n"
"\tmovss (%%edi), %%xmm1\n"
"\tadd $4, %%eax\n"
"\tadd $4, %%edi\n"
"\tmulss %%xmm0, %%xmm1\n"
"\taddss %%xmm1, %%xmm3\n"

"cond1%=:\n"
"\tsub $1, %%ecx\n"
"\tjae mul1_loop%=\n"

// Horizontal reduction: add high pair onto low pair, then lane 1 onto
// lane 0 (shufps $0x55 broadcasts lane 1), leaving the total in lane 0.
"\tmovhlps %%xmm3, %%xmm4\n"
"\taddps %%xmm4, %%xmm3\n"
"\tmovaps %%xmm3, %%xmm4\n"

"\tshufps $0x55, %%xmm4, %%xmm4\n"

"\taddss %%xmm4, %%xmm3\n"
// Store the scalar result through edx into sum.
"\tmovss %%xmm3, (%%edx)\n"

"\tpop %%ecx\n"
"\tpop %%edi\n"
"\tpop %%eax\n"
"\temms\n"

: : "a" (a), "D" (b), "c" (len), "d" (&sum)
CLOBBER_SSE
);
return sum;
}
00183
00184
00185
00186 inline float vec_mahalanobis2_mul4_sse(const float *a, const float *b, const float *c, int len)
00187 {
00188 float sum=0;
00189 __asm__ __volatile__ (
00190
00191 "\tpush %%eax\n"
00192 "\tpush %%esi\n"
00193 "\tpush %%edi\n"
00194 "\tpush %%ecx\n"
00195 "\txorps %%xmm4, %%xmm4\n"
00196 "\txorps %%xmm5, %%xmm5\n"
00197
00198 "\tsub $8, %%ecx\n"
00199 "\tjb mul8_skip%=\n"
00200
00201 "mul8_loop%=:\n"
00202 "\tmovups (%%eax), %%xmm0\n"
00203 "\tmovups (%%edi), %%xmm1\n"
00204 "m\tovups 16(%%eax), %%xmm2\n"
00205 "\tmovups 16(%%edi), %%xmm3\n"
00206 "\tmovups (%%esi), %%xmm6\n"
00207 "\tmovups 16(%%esi), %%xmm7\n"
00208 "\tadd $32, %%eax\n"
00209 "\tadd $32, %%edi\n"
00210 "\tadd $32, %%esi\n"
00211 "\tsubps %%xmm0, %%xmm1\n"
00212 "\tsubps %%xmm2, %%xmm3\n"
00213 "\tmulps %%xmm1, %%xmm1\n"
00214 "\tmulps %%xmm3, %%xmm3\n"
00215 "\tmulps %%xmm6, %%xmm1\n"
00216 "\tmulps %%xmm7, %%xmm3\n"
00217 "\taddps %%xmm1, %%xmm4\n"
00218 "\taddps %%xmm3, %%xmm5\n"
00219
00220 "\tsub $8, %%ecx\n"
00221 "\tjae mul8_loop%=\n"
00222
00223 "mul8_skip%=:\n"
00224 "\taddps %%xmm5, %%xmm4\n"
00225
00226
00227 "\tadd $4, %%ecx\n"
00228 "\tjl mul4_skip%=\n"
00229
00230 "\tmovups (%%eax), %%xmm0\n"
00231 "\tmovups (%%edi), %%xmm1\n"
00232 "\tmovups (%%esi), %%xmm6\n"
00233 "\tadd $16, %%eax\n"
00234 "\tadd $16, %%edi\n"
00235 "\tadd $16, %%esi\n"
00236
00237 "\tsubps %%xmm0, %%xmm1\n"
00238 "\tmulps %%xmm1, %%xmm1\n"
00239 "\tmulps %%xmm6, %%xmm1\n"
00240 "\taddps %%xmm1, %%xmm4\n"
00241
00242 "\tsub $4, %%ecx\n"
00243
00244 "mul4_skip%=:\n"
00245
00246
00247
00248 "\tmovaps %%xmm4, %%xmm3\n"
00249
00250
00251 "\tmovhlps %%xmm3, %%xmm4\n"
00252 "\taddps %%xmm4, %%xmm3\n"
00253 "\tmovaps %%xmm3, %%xmm4\n"
00254 "\tshufps $33, %%xmm4, %%xmm4\n"
00255 "\taddss %%xmm4, %%xmm3\n"
00256 "\tmovss %%xmm3, (%%edx)\n"
00257
00258
00259 "\tpop %%ecx\n"
00260 "\tpop %%edi\n"
00261 "\tpop %%esi\n"
00262 "\tpop %%eax\n"
00263 "\temms\n"
00264
00265 : : "a" (a), "S" (c), "D" (b), "c" (len), "d" (&sum)
00266 CLOBBER_SSE
00267 );
00268
00269 return sum;
00270 }
00271
00272
00273
00274
00275 inline float vec_dist2_mul4_sse(const float *a, const float *b, int len)
00276 {
00277 float sum=0;
00278 __asm__ __volatile__ (
00279
00280 "\tpush %%eax\n"
00281 "\tpush %%edi\n"
00282 "\tpush %%ecx\n"
00283 "\txorps %%xmm4, %%xmm4\n"
00284 "\txorps %%xmm5, %%xmm5\n"
00285
00286 "\tsub $8, %%ecx\n"
00287 "j\tb mul8_skip%=\n"
00288
00289 "mul8_loop%=:\n"
00290 "\tmovups (%%eax), %%xmm0\n"
00291 "\tmovups (%%edi), %%xmm1\n"
00292 "\tmovups 16(%%eax), %%xmm2\n"
00293 "\tmovups 16(%%edi), %%xmm3\n"
00294 "\tadd $32, %%eax\n"
00295 "\tadd $32, %%edi\n"
00296 "\tsubps %%xmm0, %%xmm1\n"
00297 "\tsubps %%xmm2, %%xmm3\n"
00298 "\tmulps %%xmm1, %%xmm1\n"
00299 "\tmulps %%xmm3, %%xmm3\n"
00300 "\taddps %%xmm1, %%xmm4\n"
00301 "\taddps %%xmm3, %%xmm5\n"
00302
00303 "\tsub $8, %%ecx\n"
00304 "\tjae mul8_loop%=\n"
00305
00306 "mul8_skip%=:\n"
00307 "\taddps %%xmm5, %%xmm4\n"
00308
00309
00310 "\tadd $4, %%ecx\n"
00311 "\tjl mul4_skip%=\n"
00312
00313 "\tmovups (%%eax), %%xmm0\n"
00314 "\tmovups (%%edi), %%xmm1\n"
00315 "\tadd $16, %%eax\n"
00316 "\tadd $16, %%edi\n"
00317
00318 "\tsubps %%xmm0, %%xmm1\n"
00319 "\tmulps %%xmm1, %%xmm1\n"
00320 "\taddps %%xmm1, %%xmm4\n"
00321
00322 "\tsub $4, %%ecx\n"
00323
00324 "mul4_skip%=:\n"
00325
00326
00327
00328 "\tmovaps %%xmm4, %%xmm3\n"
00329
00330
00331 "\tmovhlps %%xmm3, %%xmm4\n"
00332 "\taddps %%xmm4, %%xmm3\n"
00333 "\tmovaps %%xmm3, %%xmm4\n"
00334 "\tshufps $33, %%xmm4, %%xmm4\n"
00335 "\taddss %%xmm4, %%xmm3\n"
00336 "\tmovss %%xmm3, (%%edx)\n"
00337
00338
00339 "\tpop %%ecx\n"
00340 "\tpop %%edi\n"
00341 "\tpop %%eax\n"
00342 "\temms\n"
00343
00344 : : "a" (a), "D" (b), "c" (len), "d" (&sum)
00345 CLOBBER_SSE
00346 );
00347
00348 return sum;
00349 }
00350
00351
00352 #else
00353
00354
00355 #define ERROR_SSE_NI {throw new GeneralException("Trying to use SSE, but Overflow not compiled with _ENABLE_SSE. Bad, bad, this should never happen", __FILE__, __LINE__);}
00356
00357 inline float vec_inner_prod_sse(const float *a, const float *b, int len)
00358 ERROR_SSE_NI
00359
00360 inline float vec_mahalanobis2_mul4_sse(const float *a, const float *b, const float *c, int len)
00361 ERROR_SSE_NI
00362
00363 inline float vec_dist2_mul4_sse(const float *a, const float *b, int len)
00364 ERROR_SSE_NI
00365
00366 #endif
00367
00368 }
00369
00370 #endif