Main Page | Namespace List | Class Hierarchy | Class List | Directories | File List | Namespace Members | Class Members

vec_sse.h

00001 // Copyright (C) 2001 Jean-Marc Valin
00002 //This file contains SSE-optimized vector primitives
00003 
00004 
00005 #ifndef VEC_SSE_H
00006 #define VEC_SSE_H
00007 
00008 #include "BaseException.h"
00009 
00010 namespace FD {
00011 
00012 #ifdef _ENABLE_SSE
00013 
00014 #define CLOBBER_SSE : "memory"
00015 
00016 
00017 //template <>
// Computes c[i] += a * b[i] for i in [0, len), using SSE (32-bit x86 inline asm).
//
// Strategy: broadcast the scalar 'a' into all four lanes of xmm0, then process
// the vector in three phases: 8 floats per iteration (two unaligned 16-byte
// loads/stores), then at most one 4-float chunk, then a scalar tail loop.
// Unaligned movups is used throughout, so b and c need no particular alignment.
//
// NOTE(review): the operands %0..%3 are advanced in place inside the asm and
// restored via push/pop rather than being declared as outputs/clobbers; this
// relies on GCC extended-asm register allocation behaviour — confirm against
// the target compiler before reuse.
inline void vec_mul_and_add_sse(const float a, const float *b, float *c, int len)
{
  __asm__ __volatile__ (
  
  // Save the compiler-chosen registers; the loops below modify all of them.
  "\tpush %0\n"
  "\tpush %1\n"
  "\tpush %2\n"
  "\tpush %3\n"

  // xmm0 = {a, a, a, a}: load the scalar, then broadcast lane 0 to all lanes.
  "\tmovss (%0), %%xmm0\n"
  "\tshufps $0, %%xmm0, %%xmm0\n"

  // Phase 1: 8 floats per iteration. 'sub $8' both decrements the counter and
  // sets CF when fewer than 8 elements remain (jb = jump if borrow).
  "\tsub $8, %2\n"
  "\tjb mul8_skip%=\n"

"mul8_loop%=:\n"
  // Load 8 floats of b, scale by a.
  "\tmovups (%1), %%xmm1\n"
  "\tmovups 16(%1), %%xmm2\n"
  "\tmulps %%xmm0, %%xmm1\n"
  "\tmulps %%xmm0, %%xmm2\n"
  
  // Read-modify-write 8 floats of c: c += a*b.
  "\tmovups (%3), %%xmm3\n"
  "\tmovups 16(%3), %%xmm4\n"
  "\taddps %%xmm1, %%xmm3\n"
  "\taddps %%xmm2, %%xmm4\n"
  "\tmovups %%xmm3, (%3)\n"
  "\tmovups %%xmm4, 16(%3)\n"

  // Advance both pointers by 8 floats (32 bytes) and loop while >= 8 remain.
  "\tadd $32, %1\n"
  "\tadd $32, %3\n"
  "\tsub $8,  %2\n"

  "\tjae mul8_loop%=\n"

"mul8_skip%=:\n"

  // Phase 2: counter is in [-8, -1]; adding 4 tests whether >= 4 remain.
  "\tadd $4, %2\n"
  "\tjl mul4_skip%=\n"

  // One 4-float chunk: c[0..3] += a * b[0..3].
  "\tmovups (%1), %%xmm1\n"
  "\tmulps %%xmm0, %%xmm1\n"
  "\tmovups (%3), %%xmm3\n"
  "\taddps %%xmm1, %%xmm3\n"
  "\tmovups %%xmm3, (%3)\n"

  "\tadd $16, %1\n"
  "\tadd $16, %3\n"

  "\tsub $4,  %2\n"

"mul4_skip%=:\n"

  // Phase 3: scalar tail. Restore the counter to the 0..3 remainder, then
  // run a test-at-top loop (jump straight to the condition first).
  "\tadd $4, %2\n"

  "\tjmp cond1%=\n"

"mul1_loop%=:\n"

  // One float: c[0] += a * b[0].
  "\tmovss (%1), %%xmm1\n"
  "\tmulss %%xmm0, %%xmm1\n"
  "\tmovss (%3), %%xmm3\n"
  "\taddss %%xmm1, %%xmm3\n"
  "\tmovss %%xmm3, (%3)\n"
  "\tadd $4, %1\n"
  "\tadd $4, %3\n"

"cond1%=:\n"
  // Loop until the counter borrows below zero.
  "\tsub $1, %2\n"
  "\tjae mul1_loop%=\n"

  // Restore the saved registers (reverse order of the pushes).
  "\tpop %3\n"
  "\tpop %2\n"
  "\tpop %1\n"
  "\tpop %0\n"
  
  // No outputs; inputs: &a, b, len, c. CLOBBER_SSE expands to : "memory".
  : : "r" (&a), "r" (b), "q" (len), "r" (c)
  CLOBBER_SSE
  );
}
00098 
// Returns the inner (dot) product sum(a[i]*b[i]) for i in [0, len),
// using SSE (32-bit x86 inline asm, hard-coded eax/edi/ecx/edx registers).
//
// Accumulates into two vector registers (xmm3/xmm4) during the 8-wide phase
// to break the addps dependency chain, then folds them together and does a
// horizontal sum at the end. Unaligned loads throughout.
inline float vec_inner_prod_sse(const float *a, const float *b, int len)
{
  float sum;
  __asm__ __volatile__ (
  
  // Save the registers pinned by the constraints; the loops modify them.
  "\tpush %%eax\n"
  "\tpush %%edi\n"
  "\tpush %%ecx\n"
  // Zero the two partial-sum accumulators.
  "\txorps %%xmm3, %%xmm3\n"
  "\txorps %%xmm4, %%xmm4\n"

  // Phase 1: 8 floats per iteration (jb = fewer than 8 remain).
  "\tsub $8, %%ecx\n"
  "\tjb mul8_skip%=\n"

"mul8_loop%=:\n"
  "\tmovups (%%eax), %%xmm0\n"
  "\tmovups (%%edi), %%xmm1\n"
  "\tmovups 16(%%eax), %%xmm5\n"
  "\tmovups 16(%%edi), %%xmm6\n"
  "\tadd $32, %%eax\n"
  "\tadd $32, %%edi\n"
  // xmm3 += a[0..3]*b[0..3]; xmm4 += a[4..7]*b[4..7].
  "\tmulps %%xmm0, %%xmm1\n"
  "\tmulps %%xmm5, %%xmm6\n"
  "\taddps %%xmm1, %%xmm3\n"
  "\taddps %%xmm6, %%xmm4\n"

  "\tsub $8,  %%ecx\n"

  "\tjae mul8_loop%=\n"

"mul8_skip%=:\n"

  // Fold the second accumulator into the first.
  "\taddps %%xmm4, %%xmm3\n"

  // Phase 2: at most one 4-float chunk.
  "\tadd $4, %%ecx\n"
  "\tjl mul4_skip%=\n"

  "\tmovups (%%eax), %%xmm0\n"
  "\tmovups (%%edi), %%xmm1\n"
  "\tadd $16, %%eax\n"
  "\tadd $16, %%edi\n"
  "\tmulps %%xmm0, %%xmm1\n"
  "\taddps %%xmm1, %%xmm3\n"

  "\tsub $4,  %%ecx\n"

"mul4_skip%=:\n"

  // Phase 3: scalar tail (0..3 elements), test-at-top loop.
  "\tadd $4, %%ecx\n"

  "\tjmp cond1%=\n"

"mul1_loop%=:\n"
  "\tmovss (%%eax), %%xmm0\n"
  "\tmovss (%%edi), %%xmm1\n"
  "\tadd $4, %%eax\n"
  "\tadd $4, %%edi\n"
  "\tmulss %%xmm0, %%xmm1\n"
  "\taddss %%xmm1, %%xmm3\n"

"cond1%=:\n"
  "\tsub $1, %%ecx\n"
  "\tjae mul1_loop%=\n"

  // Horizontal sum of xmm3: fold high pair onto low pair, then add lane 1
  // into lane 0.
  "\tmovhlps %%xmm3, %%xmm4\n"
  "\taddps %%xmm4, %%xmm3\n"
  "\tmovaps %%xmm3, %%xmm4\n"
  // Either $0x55 (broadcast lane 1) or $33 (=0x21, lane 1 into lane 0) works
  // here: only lane 0 of xmm4 is consumed by the addss below, and both
  // immediates place lane 1 of the source in lane 0 of the result.
  "\tshufps $0x55, %%xmm4, %%xmm4\n"
  //shufps $33, %%xmm4, %%xmm4
  "\taddss %%xmm4, %%xmm3\n"
  // Store the scalar result through edx (= &sum).
  "\tmovss %%xmm3, (%%edx)\n"
  
  "\tpop %%ecx\n"
  "\tpop %%edi\n"
  "\tpop %%eax\n"
  // NOTE(review): emms clears MMX state; this routine uses only SSE registers,
  // so it is presumably defensive (or copied from an MMX variant) — confirm.
  "\temms\n"
  
  : : "a" (a), "D" (b), "c" (len), "d" (&sum)
CLOBBER_SSE
  );
  return sum;
}
00183 
00184 //WARNING:
00185 //FIXME: Does not work yet with lengths that are not a multiple of 4
00186 inline float vec_mahalanobis2_mul4_sse(const float *a, const float *b, const float *c, int len)
00187 {
00188    float sum=0;
00189    __asm__ __volatile__ (
00190    
00191    "\tpush %%eax\n"
00192    "\tpush %%esi\n"
00193    "\tpush %%edi\n"
00194    "\tpush %%ecx\n"
00195    "\txorps %%xmm4, %%xmm4\n"
00196    "\txorps %%xmm5, %%xmm5\n"
00197 
00198    "\tsub $8, %%ecx\n"
00199    "\tjb mul8_skip%=\n"
00200 
00201 "mul8_loop%=:\n"
00202    "\tmovups (%%eax), %%xmm0\n"
00203    "\tmovups (%%edi), %%xmm1\n"
00204    "m\tovups 16(%%eax), %%xmm2\n"
00205    "\tmovups 16(%%edi), %%xmm3\n"
00206    "\tmovups (%%esi), %%xmm6\n"
00207    "\tmovups 16(%%esi), %%xmm7\n"
00208    "\tadd $32, %%eax\n"
00209    "\tadd $32, %%edi\n"
00210    "\tadd $32, %%esi\n"
00211    "\tsubps %%xmm0, %%xmm1\n"
00212    "\tsubps %%xmm2, %%xmm3\n"
00213    "\tmulps %%xmm1, %%xmm1\n"
00214    "\tmulps %%xmm3, %%xmm3\n"
00215    "\tmulps %%xmm6, %%xmm1\n"
00216    "\tmulps %%xmm7, %%xmm3\n"
00217    "\taddps %%xmm1, %%xmm4\n"
00218    "\taddps %%xmm3, %%xmm5\n"
00219 
00220    "\tsub $8,  %%ecx\n"
00221    "\tjae mul8_loop%=\n"
00222 
00223 "mul8_skip%=:\n"
00224    "\taddps %%xmm5, %%xmm4\n"
00225 
00226 
00227    "\tadd $4, %%ecx\n"
00228    "\tjl mul4_skip%=\n"
00229 
00230    "\tmovups (%%eax), %%xmm0\n"
00231    "\tmovups (%%edi), %%xmm1\n"
00232    "\tmovups (%%esi), %%xmm6\n"
00233    "\tadd $16, %%eax\n"
00234    "\tadd $16, %%edi\n"
00235    "\tadd $16, %%esi\n"
00236 
00237    "\tsubps %%xmm0, %%xmm1\n"
00238    "\tmulps %%xmm1, %%xmm1\n"
00239    "\tmulps %%xmm6, %%xmm1\n"
00240    "\taddps %%xmm1, %%xmm4\n"
00241 
00242    "\tsub $4,  %%ecx\n"
00243 
00244 "mul4_skip%=:\n"
00245 
00246 
00247 
00248    "\tmovaps %%xmm4, %%xmm3\n"
00249 
00250 
00251    "\tmovhlps %%xmm3, %%xmm4\n"
00252    "\taddps %%xmm4, %%xmm3\n"
00253    "\tmovaps %%xmm3, %%xmm4\n"
00254    "\tshufps $33, %%xmm4, %%xmm4\n"
00255    "\taddss %%xmm4, %%xmm3\n"
00256    "\tmovss %%xmm3, (%%edx)\n"
00257 
00258 
00259    "\tpop %%ecx\n"
00260    "\tpop %%edi\n"
00261    "\tpop %%esi\n"
00262    "\tpop %%eax\n"
00263    "\temms\n"
00264    
00265    : : "a" (a), "S" (c), "D" (b), "c" (len), "d" (&sum)
00266    CLOBBER_SSE
00267    );
00268     
00269    return sum;
00270 }
00271 
00272 
00273 //WARNING:
00274 //FIXME: Does not work yet with lengths that are not a multiple of 4
00275 inline float vec_dist2_mul4_sse(const float *a, const float *b, int len)
00276 {
00277    float sum=0;
00278    __asm__ __volatile__ (
00279    
00280    "\tpush %%eax\n"
00281    "\tpush %%edi\n"
00282    "\tpush %%ecx\n"
00283    "\txorps %%xmm4, %%xmm4\n"
00284    "\txorps %%xmm5, %%xmm5\n"
00285 
00286    "\tsub $8, %%ecx\n"
00287    "j\tb mul8_skip%=\n"
00288 
00289 "mul8_loop%=:\n"
00290    "\tmovups (%%eax), %%xmm0\n"
00291    "\tmovups (%%edi), %%xmm1\n"
00292    "\tmovups 16(%%eax), %%xmm2\n"
00293    "\tmovups 16(%%edi), %%xmm3\n"
00294    "\tadd $32, %%eax\n"
00295    "\tadd $32, %%edi\n"
00296    "\tsubps %%xmm0, %%xmm1\n"
00297    "\tsubps %%xmm2, %%xmm3\n"
00298    "\tmulps %%xmm1, %%xmm1\n"
00299    "\tmulps %%xmm3, %%xmm3\n"
00300    "\taddps %%xmm1, %%xmm4\n"
00301    "\taddps %%xmm3, %%xmm5\n"
00302 
00303    "\tsub $8,  %%ecx\n"
00304    "\tjae mul8_loop%=\n"
00305 
00306 "mul8_skip%=:\n"
00307    "\taddps %%xmm5, %%xmm4\n"
00308 
00309 
00310    "\tadd $4, %%ecx\n"
00311    "\tjl mul4_skip%=\n"
00312 
00313    "\tmovups (%%eax), %%xmm0\n"
00314    "\tmovups (%%edi), %%xmm1\n"
00315    "\tadd $16, %%eax\n"
00316    "\tadd $16, %%edi\n"
00317 
00318    "\tsubps %%xmm0, %%xmm1\n"
00319    "\tmulps %%xmm1, %%xmm1\n"
00320    "\taddps %%xmm1, %%xmm4\n"
00321 
00322    "\tsub $4,  %%ecx\n"
00323 
00324 "mul4_skip%=:\n"
00325 
00326 
00327 
00328    "\tmovaps %%xmm4, %%xmm3\n"
00329 
00330 
00331    "\tmovhlps %%xmm3, %%xmm4\n"
00332    "\taddps %%xmm4, %%xmm3\n"
00333    "\tmovaps %%xmm3, %%xmm4\n"
00334    "\tshufps $33, %%xmm4, %%xmm4\n"
00335    "\taddss %%xmm4, %%xmm3\n"
00336    "\tmovss %%xmm3, (%%edx)\n"
00337 
00338 
00339    "\tpop %%ecx\n"
00340    "\tpop %%edi\n"
00341    "\tpop %%eax\n"
00342    "\temms\n"
00343    
00344    : : "a" (a), "D" (b), "c" (len), "d" (&sum)
00345    CLOBBER_SSE
00346    );
00347     
00348    return sum;
00349 }
00350 
00351 
00352 #else /* _ENABLE_SSE */
00353 
00354 
00355 #define ERROR_SSE_NI {throw new GeneralException("Trying to use SSE, but Overflow not compiled with _ENABLE_SSE. Bad, bad, this should never happen", __FILE__, __LINE__);}
00356 
00357 inline float vec_inner_prod_sse(const float *a, const float *b, int len)
00358   ERROR_SSE_NI
00359 
00360 inline float vec_mahalanobis2_mul4_sse(const float *a, const float *b, const float *c, int len)
00361   ERROR_SSE_NI
00362 
00363 inline float vec_dist2_mul4_sse(const float *a, const float *b, int len)
00364   ERROR_SSE_NI
00365 
00366 #endif /* !_ENABLE_SSE */
00367 
00368 }//namespace FD
00369 
00370 #endif

Generated on Wed Oct 5 14:28:56 2005 for FlowDesigner by  doxygen 1.4.4