00001
00002
00003
00004
00005 #ifndef VEC_3DNOW_H
00006 #define VEC_3DNOW_H
00007
00008 #include "BaseException.h"
00009
00010 namespace FD {
00011
00012 #ifdef _ENABLE_3DNOW
00013
00014
00015
00016 #define CLOBBER_3DNOW : "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)", "memory"
00017
// Dot product of a[0..len-1] and b[0..len-1] using AMD 3DNow! SIMD.
// 32-bit x86 only (eax/edi/ecx/edx); the caller is responsible for
// having verified 3DNow! CPU support -- nothing here checks.
//
// Strategy: the main loop consumes 4 floats per iteration into two
// independent packed accumulators (mm4, mm5), then a 2-element tail,
// then a 1-element tail.  A final pfacc adds the two halves of mm4
// horizontally and the 8-byte movq spills the result through edx.
inline float vec_inner_prod_3dnow(const float *a, const float *b, int len)
{
// movq stores 8 bytes, so the result buffer must be two floats wide;
// only sum[0] (the horizontal total) is meaningful on return.
float sum[2]={0,0};
__asm__ __volatile__ (

// Registers are declared below as inputs only, so they are saved and
// restored manually with push/pop.
"\tpush %%eax \n"
"\tpush %%edi \n"
"\tpush %%ecx \n"
"\tpxor %%mm4, %%mm4 \n"
"\tpxor %%mm5, %%mm5 \n"
// If len < 4 the subtraction borrows (CF=1) and jb skips the 4-wide loop.
"\tsub $4, %%ecx \n"
"\tjb mul4_skip%= \n"

"mul4_loop%=: \n"
"\tmovq (%%eax), %%mm0 \n"
"\tmovq (%%edi), %%mm1 \n"
"\tmovq 8(%%eax), %%mm2 \n"
"\tmovq 8(%%edi), %%mm3 \n"
"\tadd $16, %%eax \n"
"\tadd $16, %%edi \n"
"\tpfmul %%mm0, %%mm1 \n"
"\tpfmul %%mm2, %%mm3 \n"
"\tpfadd %%mm1, %%mm4 \n"
"\tpfadd %%mm3, %%mm5 \n"
"\tsub $4, %%ecx \n"
"\tjae mul4_loop%= \n"
// Fold the second accumulator into the first.  Only reached when the
// 4-wide loop actually ran; when it is skipped, mm5 is still zero.
"\tpfadd %%mm5,%%mm4 \n"

"mul4_skip%=: \n"
// Here ecx == (len mod 4) - 4.  Adding 2 sets CF exactly when 2 or
// more elements remain; jae (== jnc) skips the pair tail otherwise.
"\tadd $2, %%ecx \n"
"\tjae mul2_skip%= \n"
"\tmovq (%%eax), %%mm0 \n"
"\tmovq (%%edi), %%mm1 \n"
"\tadd $8, %%eax \n"
"\tadd $8, %%edi \n"
"\tpfmul %%mm0, %%mm1 \n"
"\tpfadd %%mm1, %%mm4 \n"

"mul2_skip%=: \n"
// Low bit of the (wrapped) counter is still len's parity: process the
// final odd element, zero-padding the high halves so pfadd is a no-op there.
"\tand $1, %%ecx \n"
"\tjz even%= \n"
"\tpxor %%mm0, %%mm0 \n"
"\tpxor %%mm1, %%mm1 \n"
"\tmovd (%%eax), %%mm0 \n"
"\tmovd (%%edi), %%mm1 \n"
"\tpfmul %%mm0, %%mm1 \n"
"\tpfadd %%mm1, %%mm4 \n"

"even%=: \n"
// pfacc: mm4.lo + mm4.hi -> low half of mm4 (high half gets 0 from mm5).
"\tpxor %%mm5, %%mm5 \n"
"\tpfacc %%mm5, %%mm4 \n"
"\tmovq %%mm4, (%%edx) \n"
"\tpop %%ecx \n"
"\tpop %%edi \n"
"\tpop %%eax \n"
// femms: leave MMX/3DNow! state so following x87 FPU code is safe.
"\tfemms \n"
: : "a" (a), "D" (b), "c" (len), "d" (sum)
CLOBBER_3DNOW
);

return sum[0];
}
00080
00081
00082
// Element-wise vector addition using 3DNow!: c[i] = a[i] + b[i] for
// i in [0, len).  32-bit x86 only; caller must have verified 3DNow!
// support.  `c` may not be read before it is written here, so in-place
// use (c == a or c == b) works element-by-element, but partial overlap
// of the ranges is unsafe -- same caveat as memcpy.
inline void vec_add_vec_3dnow(const float *a, const float *b, float *c, int len)
{
__asm__ __volatile__ (

// Operands are inputs only, so the registers modified inside are
// preserved manually with push/pop.
"\tpush %%eax \n"
"\tpush %%edi \n"
"\tpush %%ecx \n"
"\tpush %%edx \n"

// 4-wide main loop; jb skips it entirely when len < 4.
"\tsub $4, %%ecx \n"
"\tjb mul4_skip%= \n"

"mul4_loop%=: \n"
"\tmovq (%%eax), %%mm0 \n"
"\tmovq (%%edi), %%mm1 \n"
"\tpfadd %%mm0, %%mm1 \n"
"\tmovq 8(%%eax), %%mm2 \n"
"\tmovq 8(%%edi), %%mm3 \n"
"\tpfadd %%mm2, %%mm3 \n"

"\tmovq %%mm1, (%%edx) \n"
"\tmovq %%mm3, 8(%%edx) \n"
"\tadd $16, %%eax \n"
"\tadd $16, %%edi \n"
"\tadd $16, %%edx \n"
"\tsub $4, %%ecx \n"

"\tjae mul4_loop%= \n"

"mul4_skip%=: \n"

// ecx == (len mod 4) - 4; add $2 sets CF iff >= 2 elements remain,
// so jae (jnc) skips the pair tail when fewer than 2 are left.
"\tadd $2, %%ecx \n"
"\tjae mul2_skip%= \n"

"\tmovq (%%eax), %%mm0 \n"
"\tmovq (%%edi), %%mm1 \n"
"\tpfadd %%mm0, %%mm1 \n"
"\tmovq %%mm1, (%%edx) \n"
"\tadd $8, %%eax \n"
"\tadd $8, %%edi \n"
"\tadd $8, %%edx \n"

"mul2_skip%=: \n"

// Final odd element: operate on the low half only (high halves zeroed),
// store with movd so no byte past c[len-1] is written.
"\tand $1, %%ecx \n"
"\tjz even%= \n"

"\tpxor %%mm0, %%mm0 \n"
"\tpxor %%mm1, %%mm1 \n"
"\tmovd (%%eax), %%mm0 \n"
"\tmovd (%%edi), %%mm1 \n"
"\tpfadd %%mm0, %%mm1 \n"
"\tmovd %%mm1, (%%edx) \n"
"even%=: \n"

"\tpop %%edx \n"
"\tpop %%ecx \n"
"\tpop %%edi \n"
"\tpop %%eax \n"
// Exit MMX/3DNow! state for any following x87 code.
"\tfemms \n"
: : "a" (a), "D" (b), "c" (len), "d" (c)
CLOBBER_3DNOW
);
}
00147
00148
00149
// Element-wise vector subtraction using 3DNow!: c[i] = a[i] - b[i].
// 32-bit x86 only; caller must have verified 3DNow! support.
// pfsubr (reverse subtract) is used because mm1 holds b: in AT&T order
// "pfsubr %%mm0, %%mm1" computes mm1 = mm0 - mm1 = a - b.
inline void vec_sub_vec_3dnow(const float *a, const float *b, float *c, int len)
{
__asm__ __volatile__ (

// Operands are inputs only; modified registers are preserved manually.
"\tpush %%eax \n"
"\tpush %%edi \n"
"\tpush %%ecx \n"
"\tpush %%edx \n"

// 4-wide main loop; jb skips it when len < 4.
"\tsub $4, %%ecx \n"
"\tjb mul4_skip%= \n"

"mul4_loop%=: \n"
"\tmovq (%%eax), %%mm0 \n"
"\tmovq (%%edi), %%mm1 \n"
"\tpfsubr %%mm0, %%mm1 \n"
"\tmovq 8(%%eax), %%mm2 \n"
"\tmovq 8(%%edi), %%mm3 \n"
"\tpfsubr %%mm2, %%mm3 \n"

"\tmovq %%mm1, (%%edx) \n"
"\tmovq %%mm3, 8(%%edx) \n"
"\tadd $16, %%eax \n"
"\tadd $16, %%edi \n"
"\tadd $16, %%edx \n"
"\tsub $4, %%ecx \n"

"\tjae mul4_loop%= \n"

"mul4_skip%=: \n"

// add $2 sets CF iff >= 2 elements remain; jae skips the pair otherwise.
"\tadd $2, %%ecx \n"
"\tjae mul2_skip%= \n"

"\tmovq (%%eax), %%mm0 \n"
"\tmovq (%%edi), %%mm1 \n"
"\tpfsubr %%mm0, %%mm1 \n"
"\tmovq %%mm1, (%%edx) \n"
"\tadd $8, %%eax \n"
"\tadd $8, %%edi \n"
"\tadd $8, %%edx \n"

// NOTE(review): the leading tab before this label is harmless to GAS
// (labels need not start in column 0), just inconsistent with the
// sibling functions.
"\tmul2_skip%=: \n"

// Final odd element, low halves only.
"\tand $1, %%ecx \n"
"\tjz even%= \n"

"\tpxor %%mm0, %%mm0 \n"
"\tpxor %%mm1, %%mm1 \n"
"\tmovd (%%eax), %%mm0 \n"
"\tmovd (%%edi), %%mm1 \n"
"\tpfsubr %%mm0, %%mm1 \n"
"\tmovd %%mm1, (%%edx) \n"
"even%=: \n"

"\tpop %%edx \n"
"\tpop %%ecx \n"
"\tpop %%edi \n"
"\tpop %%eax \n"
// Exit MMX/3DNow! state.
"\tfemms \n"
: : "a" (a), "D" (b), "c" (len), "d" (c)
CLOBBER_3DNOW
);
}
00214
00215
00216
// Element-wise vector multiplication using 3DNow!: c[i] = a[i] * b[i].
// 32-bit x86 only; caller must have verified 3DNow! support.
// Same 4 / 2 / 1 tail structure as vec_add_vec_3dnow.
inline void vec_mul_vec_3dnow(const float *a, const float *b, float *c, int len)
{
__asm__ __volatile__ (

// Operands are inputs only; modified registers are preserved manually.
"\tpush %%eax \n"
"\tpush %%edi \n"
"\tpush %%ecx \n"
"\tpush %%edx \n"

// 4-wide main loop; jb skips it when len < 4.
"\tsub $4, %%ecx \n"
"\tjb mul4_skip%= \n"

"mul4_loop%=: \n"
"\tmovq (%%eax), %%mm0 \n"
"\tmovq (%%edi), %%mm1 \n"
"\tpfmul %%mm0, %%mm1 \n"
"\tmovq 8(%%eax), %%mm2 \n"
"\tmovq 8(%%edi), %%mm3 \n"
"\tpfmul %%mm2, %%mm3 \n"

"\tmovq %%mm1, (%%edx) \n"
"\tmovq %%mm3, 8(%%edx) \n"
"\tadd $16, %%eax \n"
"\tadd $16, %%edi \n"
"\tadd $16, %%edx \n"
"\tsub $4, %%ecx \n"

"\tjae mul4_loop%= \n"

"mul4_skip%=: \n"

// add $2 sets CF iff >= 2 elements remain; jae skips the pair otherwise.
"\tadd $2, %%ecx \n"
"\tjae mul2_skip%= \n"

"\tmovq (%%eax), %%mm0 \n"
"\tmovq (%%edi), %%mm1 \n"
"\tpfmul %%mm0, %%mm1 \n"
"\tmovq %%mm1, (%%edx) \n"
"\tadd $8, %%eax \n"
"\tadd $8, %%edi \n"
"\tadd $8, %%edx \n"

// NOTE(review): leading tab before the label is harmless to GAS.
"\tmul2_skip%=: \n"

// Final odd element, low halves only; movd avoids writing past c[len-1].
"\tand $1, %%ecx \n"
"\tjz even%= \n"

"\tpxor %%mm0, %%mm0 \n"
"\tpxor %%mm1, %%mm1 \n"
"\tmovd (%%eax), %%mm0 \n"
"\tmovd (%%edi), %%mm1 \n"
"\tpfmul %%mm0, %%mm1 \n"
"\tmovd %%mm1, (%%edx) \n"
"even%=: \n"

"\tpop %%edx \n"
"\tpop %%ecx \n"
"\tpop %%edi \n"
"\tpop %%eax \n"
// Exit MMX/3DNow! state.
"\tfemms \n"
: : "a" (a), "D" (b), "c" (len), "d" (c)
CLOBBER_3DNOW
);
}
00281
00282
00283
// Scalar-plus-vector using 3DNow!: c[i] = a + b[i] for i in [0, len).
// 32-bit x86 only; caller must have verified 3DNow! support.
// The scalar is broadcast into both halves of an 8-byte buffer so a
// single movq loads it packed into mm0, which stays live for the loop.
inline void vec_add_scal_3dnow(const float a, const float *b, float *c, int len)
{
// tmp[0] == tmp[1] == a: packed broadcast of the scalar for movq.
float tmp[2];
tmp[0]=tmp[1]=a;
__asm__ __volatile__ (

// eax (-> tmp) is never modified below, so it is not saved.
"\tpush %%edi \n"
"\tpush %%ecx \n"
"\tpush %%edx \n"
// Load the broadcast scalar once; reused every iteration.
"\tmovq (%%eax), %%mm0 \n"

// 4-wide main loop; jb skips it when len < 4.
"\tsub $4, %%ecx \n"
"\tjb mul4_skip%= \n"

"mul4_loop%=: \n"
"\tmovq (%%edi), %%mm1 \n"
"\tpfadd %%mm0, %%mm1 \n"
"\tmovq 8(%%edi), %%mm2 \n"
"\tpfadd %%mm0, %%mm2 \n"

"\tmovq %%mm1, (%%edx) \n"
"\tmovq %%mm2, 8(%%edx) \n"
"\tadd $16, %%edi \n"
"\tadd $16, %%edx \n"
"\tsub $4, %%ecx \n"

"\tjae mul4_loop%= \n"

"mul4_skip%=: \n"

// add $2 sets CF iff >= 2 elements remain; jae skips the pair otherwise.
"\tadd $2, %%ecx \n"
"\tjae mul2_skip%= \n"

"\tmovq (%%edi), %%mm1 \n"
"\tpfadd %%mm0, %%mm1 \n"
"\tmovq %%mm1, (%%edx) \n"
"\tadd $8, %%edi \n"
"\tadd $8, %%edx \n"

"mul2_skip%=: \n"

// Final odd element.  mm0 is cleared and the scalar reloaded into its
// low half only (eax still points at tmp), so the pfadd/movd pair
// touches just one float.  This is the last use of mm0.
"\tand $1, %%ecx \n"
"\tjz even%= \n"

"\tpxor %%mm0, %%mm0 \n"
"\tpxor %%mm1, %%mm1 \n"
"\tmovd (%%eax), %%mm0 \n"
"\tmovd (%%edi), %%mm1 \n"
"\tpfadd %%mm0, %%mm1 \n"
"\tmovd %%mm1, (%%edx) \n"
"even%=: \n"

"\tpop %%edx \n"
"\tpop %%ecx \n"
"\tpop %%edi \n"
// Exit MMX/3DNow! state.
"\tfemms \n"

: : "a" (tmp), "D" (b), "c" (len), "d" (c)
CLOBBER_3DNOW
);
}
00345
00346
00347
// Scalar-times-vector using 3DNow!: c[i] = a * b[i] for i in [0, len).
// 32-bit x86 only; caller must have verified 3DNow! support.
// Mirrors vec_add_scal_3dnow with pfmul in place of pfadd.
inline void vec_mul_scal_3dnow(const float a, const float *b, float *c, int len)
{
// tmp[0] == tmp[1] == a: packed broadcast of the scalar for movq.
float tmp[2];
tmp[0]=tmp[1]=a;
__asm__ __volatile__ (

// eax (-> tmp) is never modified below, so it is not saved.
"\tpush %%edi \n"
"\tpush %%ecx \n"
"\tpush %%edx \n"
// Load the broadcast scalar once; reused every iteration.
"\tmovq (%%eax), %%mm0 \n"

// 4-wide main loop; jb skips it when len < 4.
"\tsub $4, %%ecx \n"
"\tjb mul4_skip%= \n"

"mul4_loop%=: \n"
"\tmovq (%%edi), %%mm1 \n"
"\tpfmul %%mm0, %%mm1 \n"
"\tmovq 8(%%edi), %%mm2 \n"
"\tpfmul %%mm0, %%mm2 \n"

"\tmovq %%mm1, (%%edx) \n"
"\tmovq %%mm2, 8(%%edx) \n"
"\tadd $16, %%edi \n"
"\tadd $16, %%edx \n"
"\tsub $4, %%ecx \n"

"\tjae mul4_loop%= \n"

"mul4_skip%=: \n"

// add $2 sets CF iff >= 2 elements remain; jae skips the pair otherwise.
"\tadd $2, %%ecx \n"
"\tjae mul2_skip%= \n"

"\tmovq (%%edi), %%mm1 \n"
"\tpfmul %%mm0, %%mm1 \n"
"\tmovq %%mm1, (%%edx) \n"
"\tadd $8, %%edi \n"
"\tadd $8, %%edx \n"

"mul2_skip%=: \n"

// Final odd element: scalar reloaded into low half only (eax still
// points at tmp); movd stores a single float.
"\tand $1, %%ecx \n"
"\tjz even%= \n"

"\tpxor %%mm0, %%mm0 \n"
"\tpxor %%mm1, %%mm1 \n"
"\tmovd (%%eax), %%mm0 \n"
"\tmovd (%%edi), %%mm1 \n"
"\tpfmul %%mm0, %%mm1 \n"
"\tmovd %%mm1, (%%edx) \n"
"even%=: \n"

"\tpop %%edx \n"
"\tpop %%ecx \n"
"\tpop %%edi \n"
// Exit MMX/3DNow! state.
"\tfemms \n"

: : "a" (tmp), "D" (b), "c" (len), "d" (c)
CLOBBER_3DNOW
);
}
00409
00410
00411
// SQUARED Euclidean distance between a[0..len-1] and b[0..len-1] using
// 3DNow!: returns sum over i of (b[i] - a[i])^2.  No square root is
// taken -- despite the name, the caller gets the squared distance.
// 32-bit x86 only; caller must have verified 3DNow! support.
inline float vec_dist_3dnow(const float *a, const float *b, int len)
{

// movq stores 8 bytes; only sum[0] (the pfacc horizontal total) is used.
float sum[2]={0,0};
__asm__ __volatile__ (

// Operands are inputs only; modified registers are preserved manually.
"\tpush %%eax \n"
"\tpush %%edi \n"
"\tpush %%ecx \n"
// Two independent packed accumulators to hide pfadd latency.
"\tpxor %%mm4, %%mm4 \n"
"\tpxor %%mm5, %%mm5 \n"

// 4-wide main loop; jb skips it when len < 4.
"\tsub $4, %%ecx \n"
"\tjb mul4_skip%= \n"

"mul4_loop%=: \n"
"\tmovq (%%eax), %%mm0 \n"
"\tmovq (%%edi), %%mm1 \n"
"\tmovq 8(%%eax), %%mm2 \n"
"\tmovq 8(%%edi), %%mm3 \n"
"\tadd $16, %%eax \n"
"\tadd $16, %%edi \n"
// pfsub (AT&T): mm1 = mm1 - mm0 = b - a; then square and accumulate.
"\tpfsub %%mm0, %%mm1 \n"
"\tpfsub %%mm2, %%mm3 \n"
"\tpfmul %%mm1, %%mm1 \n"
"\tpfmul %%mm3, %%mm3 \n"
"\tpfadd %%mm1, %%mm4 \n"
"\tpfadd %%mm3, %%mm5 \n"

"\tsub $4, %%ecx \n"

"\tjae mul4_loop%= \n"

// Fold the second accumulator into the first (skipped with the loop;
// mm5 is still zero in that case).
"\tpfadd %%mm5,%%mm4 \n"

"mul4_skip%=: \n"

// add $2 sets CF iff >= 2 elements remain; jae skips the pair otherwise.
"\tadd $2, %%ecx \n"
"\tjae mul2_skip%= \n"

"\tmovq (%%eax), %%mm0 \n"
"\tmovq (%%edi), %%mm1 \n"
"\tadd $8, %%eax \n"
"\tadd $8, %%edi \n"
"\tpfsub %%mm0, %%mm1 \n"
"\tpfmul %%mm1, %%mm1 \n"
"\tpfadd %%mm1, %%mm4 \n"
"mul2_skip%=: \n"

// Final odd element, low halves only (high halves zero-padded).
"\tand $1, %%ecx \n"
"\tjz even%= \n"

"\tpxor %%mm0, %%mm0 \n"
"\tpxor %%mm1, %%mm1 \n"
"\tmovd (%%eax), %%mm0 \n"
"\tmovd (%%edi), %%mm1 \n"
"\tpfsub %%mm0, %%mm1 \n"
"\tpfmul %%mm1, %%mm1 \n"
"\tpfadd %%mm1, %%mm4 \n"
"even%=: \n"

// Horizontal add of mm4's halves; spill the 8-byte result through edx.
"\tpxor %%mm5, %%mm5 \n"
"\tpfacc %%mm5, %%mm4 \n"
"\tmovq %%mm4, (%%edx) \n"

"\tpop %%ecx \n"
"\tpop %%edi \n"
"\tpop %%eax \n"
// Exit MMX/3DNow! state.
"\tfemms \n"

: : "a" (a), "D" (b), "c" (len), "d" (sum)
CLOBBER_3DNOW
);

return sum[0];
}
00488
00489
// Weighted squared distance using 3DNow!: returns
//     sum over i of c[i] * (b[i] - a[i])^2
// i.e. a diagonal Mahalanobis-style distance -- presumably c holds the
// inverse variances; verify against callers.  No square root is taken.
// Note the register binding below: esi <- c (weights), edi <- b.
// 32-bit x86 only; caller must have verified 3DNow! support.
inline float vec_mahalanobis_3dnow(const float *a, const float *b, const float *c, int len)
{
// movq stores 8 bytes; only sum[0] (the pfacc horizontal total) is used.
float sum[2]={0,0};
__asm__ __volatile__ (

// Operands are inputs only; modified registers are preserved manually.
"\tpush %%eax \n"
"\tpush %%esi \n"
"\tpush %%edi \n"
"\tpush %%ecx \n"
// Two independent packed accumulators to hide pfadd latency.
"\tpxor %%mm4, %%mm4 \n"
"\tpxor %%mm5, %%mm5 \n"

// 4-wide main loop; jb skips it when len < 4.
"\tsub $4, %%ecx \n"
"\tjb mul4_skip%= \n"

"mul4_loop%=: \n"
"\tmovq (%%eax), %%mm0 \n"
"\tmovq (%%edi), %%mm1 \n"
"\tmovq 8(%%eax), %%mm2 \n"
"\tmovq 8(%%edi), %%mm3 \n"
// mm6/mm7 carry the weights from c (via esi).
"\tmovq (%%esi), %%mm6 \n"
"\tmovq 8(%%esi), %%mm7 \n"
"\tadd $16, %%eax \n"
"\tadd $16, %%edi \n"
"\tadd $16, %%esi \n"
// mm1 = b - a; square; multiply by weight; accumulate.
"\tpfsub %%mm0, %%mm1 \n"
"\tpfsub %%mm2, %%mm3 \n"
"\tpfmul %%mm1, %%mm1 \n"
"\tpfmul %%mm3, %%mm3 \n"
"\tpfmul %%mm6, %%mm1 \n"
"\tpfmul %%mm7, %%mm3 \n"
"\tpfadd %%mm1, %%mm4 \n"
"\tpfadd %%mm3, %%mm5 \n"

"\tsub $4, %%ecx \n"
"\tjae mul4_loop%= \n"

"mul4_skip%=: \n"

// Unlike the sibling functions this fold sits after the label, so it
// always executes -- harmless, since mm5 was zeroed at entry.
"\tpfadd %%mm5, %%mm4 \n"




// add $2 sets CF iff >= 2 elements remain; jae skips the pair otherwise.
"\tadd $2, %%ecx \n"
"\tjae mul2_skip%= \n"

"\tmovq (%%eax), %%mm0 \n"
"\tmovq (%%edi), %%mm1 \n"
"\tmovq (%%esi), %%mm6 \n"
"\tadd $8, %%eax \n"
"\tadd $8, %%edi \n"
"\tadd $8, %%esi \n"
"\tpfsub %%mm0, %%mm1 \n"
"\tpfmul %%mm1, %%mm1 \n"
"\tpfmul %%mm6, %%mm1 \n"
"\tpfadd %%mm1, %%mm4 \n"
"mul2_skip%=: \n"

// Final odd element, low halves only (high halves zero-padded).
"\tand $1, %%ecx \n"
"\tjz even%= \n"

"\tpxor %%mm0, %%mm0 \n"
"\tpxor %%mm1, %%mm1 \n"
"\tpxor %%mm6, %%mm6 \n"
"\tmovd (%%eax), %%mm0 \n"
"\tmovd (%%edi), %%mm1 \n"
"\tmovd (%%esi), %%mm6 \n"
"\tpfsub %%mm0, %%mm1 \n"
"\tpfmul %%mm1, %%mm1 \n"
"\tpfmul %%mm6, %%mm1 \n"
"\tpfadd %%mm1, %%mm4 \n"

"even%=: \n"




// Horizontal add of mm4's halves; spill the 8-byte result through edx.
"\tpxor %%mm5, %%mm5 \n"
"\tpfacc %%mm5, %%mm4 \n"
"\tmovq %%mm4, (%%edx) \n"

"\t pop %%ecx \n"
"\tpop %%edi \n"
"\tpop %%esi \n"
"\tpop %%eax \n"
// Exit MMX/3DNow! state.
"\tfemms \n"

: : "a" (a), "S" (c), "D" (b), "c" (len), "d" (sum)
CLOBBER_3DNOW
);

return sum[0];
}
00584
00585
// Sum of a[0..len-1] using 3DNow!.
// 32-bit x86 only; caller must have verified 3DNow! support.
// 4 floats per iteration split across two packed accumulators
// (mm4, mm5), then 2- and 1-element tails, then a pfacc horizontal add.
inline float vec_sum_3dnow(const float *a, int len)
{

// movq stores 8 bytes; only sum[0] (the horizontal total) is used.
float sum[2]={0,0};
__asm__ __volatile__ (

// Operands are inputs only; modified registers are preserved manually.
"\tpush %%eax \n"
"\tpush %%ecx \n"
"\tpxor %%mm4, %%mm4 \n"
"\tpxor %%mm5, %%mm5 \n"

// 4-wide main loop; jb skips it when len < 4.
"\tsub $4, %%ecx \n"
"\tjb mul4_skip%= \n"

"mul4_loop%=: \n"
"\tmovq (%%eax), %%mm0 \n"
"\tpfadd %%mm0, %%mm4 \n"
"\tmovq 8(%%eax), %%mm1 \n"
"\tpfadd %%mm1, %%mm5 \n"

"\tadd $16, %%eax \n"
"\tsub $4, %%ecx \n"

"\tjae mul4_loop%= \n"

// Fold the second accumulator into the first (skipped with the loop;
// mm5 is still zero in that case).
"\tpfadd %%mm5,%%mm4 \n"

"mul4_skip%=: \n"

// add $2 sets CF iff >= 2 elements remain; jae skips the pair otherwise.
"\tadd $2, %%ecx \n"
"\tjae mul2_skip%= \n"

"\tmovq (%%eax), %%mm0 \n"
"\tadd $8, %%eax \n"
"\tpfadd %%mm0, %%mm4 \n"
"mul2_skip%=: \n"

// Final odd element, loaded into the low half only.
"\tand $1, %%ecx \n"
"\tjz even%= \n"

"\tpxor %%mm0, %%mm0 \n"
"\tmovd (%%eax), %%mm0 \n"
"\tpfadd %%mm0, %%mm4 \n"
"even%=: \n"

// Horizontal add of mm4's halves; spill the 8-byte result through edx.
"\tpxor %%mm5, %%mm5 \n"
"\tpfacc %%mm5, %%mm4 \n"
"\tmovq %%mm4, (%%edx) \n"

"\tpop %%ecx \n"
"\tpop %%eax \n"
// Exit MMX/3DNow! state.
"\tfemms \n"

: : "a" (a), "c" (len), "d" (sum)
CLOBBER_3DNOW
);

return sum[0];
}
00645
00646
// SQUARED L2 norm of a[0..len-1] using 3DNow!: returns sum of a[i]^2.
// No square root is taken.  32-bit x86 only; caller must have verified
// 3DNow! support.  Same loop/tail structure as vec_sum_3dnow with each
// element squared (pfmul by itself) before accumulation.
inline float vec_norm2_3dnow(const float *a, int len)
{

// movq stores 8 bytes; only sum[0] (the horizontal total) is used.
float sum[2]={0,0};
__asm__ __volatile__ (

// Operands are inputs only; modified registers are preserved manually.
"\tpush %%eax \n"
"\tpush %%ecx \n"
"\tpxor %%mm4, %%mm4 \n"
"\tpxor %%mm5, %%mm5 \n"

// 4-wide main loop; jb skips it when len < 4.
"\tsub $4, %%ecx \n"
"\tjb mul4_skip%= \n"

"mul4_loop%=: \n"
"\tmovq (%%eax), %%mm0 \n"
"\tpfmul %%mm0, %%mm0 \n"
"\tpfadd %%mm0, %%mm4 \n"
"\tmovq 8(%%eax), %%mm1 \n"
"\tpfmul %%mm1, %%mm1 \n"
"\tpfadd %%mm1, %%mm5 \n"

"\tadd $16, %%eax \n"
"\tsub $4, %%ecx \n"

"\tjae mul4_loop%= \n"

// Fold the second accumulator into the first (skipped with the loop;
// mm5 is still zero in that case).
"\tpfadd %%mm5,%%mm4 \n"

"mul4_skip%=: \n"

// add $2 sets CF iff >= 2 elements remain; jae skips the pair otherwise.
"\tadd $2, %%ecx \n"
"\tjae mul2_skip%= \n"

"\tmovq (%%eax), %%mm0 \n"
"\tpfmul %%mm0, %%mm0 \n"
"\tadd $8, %%eax \n"
"\tpfadd %%mm0, %%mm4 \n"
"mul2_skip%=: \n"

// Final odd element, low half only.
"\tand $1, %%ecx \n"
"\tjz even%= \n"

"\tpxor %%mm0, %%mm0 \n"
"\tmovd (%%eax), %%mm0 \n"
"\tpfmul %%mm0, %%mm0 \n"
"\tpfadd %%mm0, %%mm4 \n"
"even%=: \n"

// Horizontal add of mm4's halves; spill the 8-byte result through edx.
"\tpxor %%mm5, %%mm5 \n"
"\tpfacc %%mm5, %%mm4 \n"
"\tmovq %%mm4, (%%edx) \n"

"\tpop %%ecx \n"
"\tpop %%eax \n"
// Exit MMX/3DNow! state.
"\tfemms \n"

: : "a" (a), "c" (len), "d" (sum)
CLOBBER_3DNOW
);

return sum[0];
}
00710
00711
00712
// Element-wise reciprocal using 3DNow!: b[i] = 1 / a[i] for i in
// [0, len), via pfrcp's seed approximation refined to full 24-bit
// precision with the pfrcpit1/pfrcpit2 Newton-Raphson steps documented
// by AMD.  Behavior for a[i] == 0 follows the hardware (no check here).
// 32-bit x86 only; caller must have verified 3DNow! support.
inline void vec_inv_3dnow(const float *a, float *b, int len)
{
__asm__ __volatile__ (

// Operands are inputs only; modified registers are preserved manually.
"\tpush %%eax \n"
"\tpush %%ecx \n"
"\tpush %%edx \n"

// 2-wide main loop (scalar movd loads, two interleaved refinements);
// jb skips it when len < 2.
"\tsub $2, %%ecx \n"
"\tjb mul2_skip%= \n"

"mul2_loop%=: \n"
"\tmovd (%%eax), %%mm0 \n"
// pfrcp: low-precision reciprocal seed of mm0's low element.
"\tpfrcp %%mm0, %%mm2 \n"
"\tmovd 4(%%eax), %%mm1 \n"
"\tpfrcp %%mm1, %%mm3 \n"


// Two Newton-Raphson steps refine the seed to full precision.
"\tpfrcpit1 %%mm2, %%mm0 \n"
"\tpfrcpit1 %%mm3, %%mm1 \n"
"\tpfrcpit2 %%mm2, %%mm0 \n"
"\tpfrcpit2 %%mm3, %%mm1 \n"

"\tmovd %%mm0, (%%edx) \n"
"\tmovd %%mm1, 4(%%edx) \n"
"\tadd $8, %%eax \n"
"\tadd $8, %%edx \n"
"\tsub $2, %%ecx \n"

"\tjae mul2_loop%= \n"

"mul2_skip%=: \n"

// Low bit of the (wrapped) counter is len's parity: one element left?
"\tand $1, %%ecx \n"
"\tjz even%= \n"

"\tmovd (%%eax), %%mm0 \n"
"\tpfrcp %%mm0, %%mm2 \n"

"\tpfrcpit1 %%mm2, %%mm0 \n"
"\tpfrcpit2 %%mm2, %%mm0 \n"
"\tmovd %%mm0, (%%edx) \n"
"even%=: \n"

"\tpop %%edx \n"
"\tpop %%ecx \n"
"\tpop %%eax \n"
// Exit MMX/3DNow! state.
"\tfemms \n"

: : "a" (a), "c" (len), "d" (b)
CLOBBER_3DNOW
);
}
00766
00767
00768
// Element-wise square root using 3DNow!: b[i] = sqrt(a[i]) for i in
// [0, len).  Computes 1/sqrt(x) from pfrsqrt's seed refined by the
// pfrsqit1/pfrcpit2 Newton-Raphson sequence documented by AMD, then
// multiplies by x (saved in mm6/mm7): x * 1/sqrt(x) == sqrt(x).
// Behavior for negative or zero input follows the hardware (no check).
// 32-bit x86 only; caller must have verified 3DNow! support.
inline void vec_sqrt_3dnow(const float *a, float *b, int len)
{
__asm__ __volatile__ (

// Operands are inputs only; modified registers are preserved manually.
"\tpush %%eax \n"
"\tpush %%ecx \n"
"\tpush %%edx \n"

// 2-wide main loop (scalar movd loads, two interleaved refinements);
// jb skips it when len < 2.
"\tsub $2, %%ecx \n"
"\tjb mul2_skip%= \n"

"mul2_loop%=: \n"
"\tmovd (%%eax), %%mm0 \n"
"\tmovd 4(%%eax), %%mm1 \n"
// Keep the original x around for the final multiply.
"\tmovq %%mm0, %%mm6 \n"
"\tmovq %%mm1, %%mm7 \n"
// Seed approximation of 1/sqrt(x) ...
"\tpfrsqrt %%mm0, %%mm2 \n"
"\tpfrsqrt %%mm1, %%mm3 \n"
// ... saved in mm4/mm5, then squared for the refinement step.
"\tmovq %%mm2, %%mm4 \n"
"\tmovq %%mm3, %%mm5 \n"
"\tpfmul %%mm2, %%mm2 \n"
"\tpfmul %%mm3, %%mm3 \n"
"\tpfrsqit1 %%mm2, %%mm0 \n"
"\tpfrsqit1 %%mm3, %%mm1 \n"
"\tpfrcpit2 %%mm4, %%mm0 \n"
"\tpfrcpit2 %%mm5, %%mm1 \n"
// sqrt(x) = x * refined 1/sqrt(x).
"\tpfmul %%mm6, %%mm0 \n"
"\tpfmul %%mm7, %%mm1 \n"

"\tmovd %%mm0, (%%edx) \n"
"\tmovd %%mm1, 4(%%edx) \n"
"\tadd $8, %%eax \n"
"\tadd $8, %%edx \n"
"\tsub $2, %%ecx \n"

"\tjae mul2_loop%= \n"

"mul2_skip%=: \n"

// Low bit of the (wrapped) counter is len's parity: one element left?
"\tand $1, %%ecx \n"
"\tjz even%= \n"

"\tmovd (%%eax), %%mm0 \n"
"\tmovq %%mm0, %%mm6 \n"
"\tpfrsqrt %%mm0, %%mm2 \n"
"\tmovq %%mm2, %%mm4 \n"
"\tpfmul %%mm2, %%mm2 \n"
"\tpfrsqit1 %%mm2, %%mm0 \n"
"\tpfrcpit2 %%mm4, %%mm0 \n"
"\tpfmul %%mm6, %%mm0 \n"

"\tmovd %%mm0, (%%edx) \n"
"even%=: \n"

"\tpop %%edx \n"
"\tpop %%ecx \n"
"\tpop %%eax \n"
// Exit MMX/3DNow! state.
"\tfemms \n"

: : "a" (a), "c" (len), "d" (b)
CLOBBER_3DNOW
);
}
00832
00833 #else
00834
// Fallback path used when the library is built WITHOUT _ENABLE_3DNOW:
// every entry point still exists (so callers compile and link), but any
// call aborts at run time by throwing.  Note the exception is thrown as
// a pointer (`throw new ...`), matching this codebase's convention for
// GeneralException.  The float-returning stubs have no return statement;
// that is fine because the throw terminates the function.
#define ERROR_3DNOW_NI {throw new GeneralException("Trying to use 3DNow!, but Overflow not compiled with _ENABLE_3DNOW. Bad, bad, this should never happen", __FILE__, __LINE__);}

// Each stub's signature mirrors the real 3DNow! implementation in the
// _ENABLE_3DNOW branch of this header.
inline float vec_inner_prod_3dnow(const float *a, const float *b, int len)
ERROR_3DNOW_NI

inline void vec_add_vec_3dnow(const float *a, const float *b, float *c, int len)
ERROR_3DNOW_NI

inline void vec_sub_vec_3dnow(const float *a, const float *b, float *c, int len)
ERROR_3DNOW_NI

inline void vec_mul_vec_3dnow(const float *a, const float *b, float *c, int len)
ERROR_3DNOW_NI

inline void vec_add_scal_3dnow(const float a, const float *b, float *c, int len)
ERROR_3DNOW_NI

inline void vec_mul_scal_3dnow(const float a, const float *b, float *c, int len)
ERROR_3DNOW_NI

inline float vec_dist_3dnow(const float *a, const float *b, int len)
ERROR_3DNOW_NI

inline float vec_mahalanobis_3dnow(const float *a, const float *b, const float *c, int len)
ERROR_3DNOW_NI

inline float vec_sum_3dnow(const float *a, int len)
ERROR_3DNOW_NI

inline float vec_norm2_3dnow(const float *a, int len)
ERROR_3DNOW_NI

inline void vec_inv_3dnow(const float *a, float *b, int len)
ERROR_3DNOW_NI

inline void vec_sqrt_3dnow(const float *a, float *b, int len)
ERROR_3DNOW_NI
00872
00873 #endif
00874
00875 }
00876
00877 #endif