FlowDesigner: vec.h Source File

00001 // Copyright (C) 2001 Jean-Marc Valin
00002 
00003 
00004 //This file contains vector primitives and supports 3DNow! and SSE (incomplete) optimization
00005 #ifndef VEC_H
00006 #define VEC_H
00007 
00008 #include <math.h>
00009 #include "vec_sse.h"
00010 #include "vec_3dnow.h"
00011 #include "iextensions.h"
00012 
00013 namespace FD {
00014 /* Prefetch routines, may be used on any type (not only float and SSE2 double) */
00015 
00016 #ifdef _USE_3DNOW /*This means we're likely to have 64-byte cache lines (unless it's a K6-II)*/
00017 
00018 #ifndef CACHE_LINES
00019 #define CACHE_LINES 64
00020 #endif /*CACHE_LINES */
00021 
00022 #ifndef MAX_PREFETCH
00023 #define MAX_PREFETCH 8192  /* The Athlon L1 size is 64 kB */
00024 #endif /*MAX_PREFETCH*/
00025 
00026 #ifndef CACHE_MASK
00027 #define CACHE_MASK 0xffffffd0
00028 #endif /*CACHE_MASK */
00029 
00030 #else /*_USE_3DNOW*/
00031 
00032 #ifndef CACHE_LINES
00033 #define CACHE_LINES 32
00034 #endif /*CACHE_LINES */
00035 
00036 #ifndef CACHE_MASK
00037 #define CACHE_MASK 0xffffffe0
00038 #endif /*CACHE_MASK */
00039 
00040 #ifndef MAX_PREFETCH
00041 #define MAX_PREFETCH 4096  /* The PIII L1 size is 32 kB */
00042 #endif /*MAX_PREFETCH*/
00043 
00044 #endif
00045 
00046 
00047 
00048 /*template <class T>
00049 void vec_prefetchnta(T *a, int len)
00050 {
00051    int size=sizeof(T)*len;
00052    if (size > MAX_PREFETCH)
00053       return;
00054    __asm__ __volatile__ (
00055    "
00056    push %%eax
00057    push %%ecx
00058 
00059 prefetch_loop%=:
00060    prefetchnta (%%eax)
00061    add %%edi, %%eax
00062    sub %%edi, %%ecx
00063    ja prefetch_loop%=
00064 
00065    pop %%ecx
00066    pop %%eax
00067    "
00068    : : "a" (a), "D" (CACHE_LINES), "c" (size)
00069    :
00070    );
00071 }
00072 */
00073 
00074 
00075 
00076 
00077 template <class T>
00078 inline void vec_copy(const T *x, T *y, int len)
00079 {
00080    while (len > 3)
00081    {
00082       *y++ = *x++;
00083       *y++ = *x++;
00084       *y++ = *x++;
00085       *y++ = *x++;
00086       len -= 4;
00087    }
00088    while (len)
00089    {
00090       *y++ = *x++;
00091       len--;
00092    }
00093 }
00094 
00095 
00096 template <class T>
00097 inline T vec_inner_prod(const T *a, const T *b, int len)
00098 {
00099   T sum1=0, sum2=0, sum3=0, sum4=0;
00100   const T *end = a+len;
00101   while (a<end-3)
00102     {
00103       sum1+=a[0]*b[0];
00104       sum2+=a[1]*b[1];
00105       sum3+=a[2]*b[2];
00106       sum4+=a[3]*b[3];
00107       a+=4;
00108       b+=4;
00109     }
00110   while (a<end)
00111     {
00112       sum1+=a[0]*b[0];
00113       a++; b++;
00114     }
00115   return (sum1+sum2)+(sum3+sum4);
00116 }
00117 
00118 template <class T>
00119 inline void vec_add_vec(const T *a, const T *b, T *c, int len)
00120 {
00121   const T *end = a+len;
00122   while (a<end-3)
00123     {
00124       c[0]=a[0]+b[0];
00125       c[1]=a[1]+b[1];
00126       c[2]=a[2]+b[2];
00127       c[3]=a[3]+b[3];
00128       a+=4;
00129       b+=4;
00130       c+=4;
00131     }
00132   while (a<end)
00133     {
00134       c[0]=a[0]+b[0];
00135       a++; b++; c++;
00136     }
00137 }
00138 
00139 template <class T>
00140 inline void vec_sub_vec(const T *a, const T *b, T *c, int len)
00141 {
00142   const T *end = a+len;
00143   while (a<end-3)
00144     {
00145       c[0]=a[0]-b[0];
00146       c[1]=a[1]-b[1];
00147       c[2]=a[2]-b[2];
00148       c[3]=a[3]-b[3];
00149       a+=4;
00150       b+=4;
00151       c+=4;
00152     }
00153   while (a<end)
00154     {
00155       c[0]=a[0]-b[0];
00156       a++; b++; c++;
00157     }
00158 }
00159 
00160 template <class T>
00161 inline void vec_mul_vec(const T *a, const T *b, T *c, int len)
00162 {
00163   const T *end = a+len;
00164   while (a<end-3)
00165     {
00166       c[0]=a[0]*b[0];
00167       c[1]=a[1]*b[1];
00168       c[2]=a[2]*b[2];
00169       c[3]=a[3]*b[3];
00170       a+=4;
00171       b+=4;
00172       c+=4;
00173     }
00174   while (a<end)
00175     {
00176       c[0]=a[0]*b[0];
00177       a++; b++; c++;
00178     }
00179 }
00180 
00181 template <class T>
00182 inline void vec_mul_and_add(const T *a, const T *b, T *c, int len)
00183 {
00184   const T *end = a+len;
00185   while (a<end-3)
00186     {
00187       c[0]+=a[0]*b[0];
00188       c[1]+=a[1]*b[1];
00189       c[2]+=a[2]*b[2];
00190       c[3]+=a[3]*b[3];
00191       a+=4;
00192       b+=4;
00193       c+=4;
00194     }
00195   while (a<end)
00196     {
00197       c[0]+=a[0]*b[0];
00198       a++; b++; c++;
00199     }
00200 }
00201 
00202 template <class T>
00203 inline void vec_mul_and_add(const T a, const T *b, T *c, int len)
00204 {
00205   const T *end = b+len;
00206   while (b<end-3)
00207     {
00208       c[0]+=a*b[0];
00209       c[1]+=a*b[1];
00210       c[2]+=a*b[2];
00211       c[3]+=a*b[3];
00212       b+=4;
00213       c+=4;
00214     }
00215   while (b<end)
00216     {
00217       c[0]+=a*b[0];
00218       b++; c++;
00219     }
00220 }
00221 
00222 template <class T>
00223 inline void vec_div_vec(const T *a, const T *b, T *c, int len)
00224 {
00225   const T *end = a+len;
00226   while (a<end-3)
00227     {
00228       c[0]=a[0]/b[0];
00229       c[1]=a[1]/b[1];
00230       c[2]=a[2]/b[2];
00231       c[3]=a[3]/b[3];
00232       a+=4;
00233       b+=4;
00234       c+=4;
00235     }
00236   while (a<end)
00237     {
00238       c[0]=a[0]/b[0];
00239       a++; b++; c++;
00240     }
00241 }
00242 
00243 
00244 template <class T>
00245 inline void vec_add_scal(const T a, const T *b, T *c, int len)
00246 {
00247   const T *end = b+len;
00248   while (b<end-3)
00249     {
00250       c[0]=a+b[0];
00251       c[1]=a+b[1];
00252       c[2]=a+b[2];
00253       c[3]=a+b[3];
00254       b+=4;
00255       c+=4;
00256     }
00257   while (b<end)
00258     {
00259       c[0]=a+b[0];
00260       b++; c++;
00261     }
00262 }
00263 
00264 template <class T>
00265 inline void vec_mul_scal(const T a, const T *b, T *c, int len)
00266 {
00267   const T *end = b+len;
00268   while (b<end-3)
00269     {
00270       c[0]=a*b[0];
00271       c[1]=a*b[1];
00272       c[2]=a*b[2];
00273       c[3]=a*b[3];
00274       b+=4;
00275       c+=4;
00276     }
00277   while (b<end)
00278     {
00279       c[0]=a*b[0];
00280       b++; c++;
00281     }
00282 }
00283 
00284 template <class T>
00285 inline T vec_dist2(const T *a, const T *b, int len)
00286 {
00287   T sum1=0, sum2=0, sum3=0, sum4=0;
00288   const T *end = a+len;
00289   while (a<end-3)
00290     {
00291       sum1+=(a[0]-b[0])*(a[0]-b[0]);
00292       sum2+=(a[1]-b[1])*(a[1]-b[1]);
00293       sum3+=(a[2]-b[2])*(a[2]-b[2]);
00294       sum4+=(a[3]-b[3])*(a[3]-b[3]);
00295       a+=4;
00296       b+=4;
00297     }
00298   while (a<end)
00299     {
00300       sum1+=(a[0]-b[0])*(a[0]-b[0]);
00301       a++; b++;
00302     }
00303   return (sum1+sum2)+(sum3+sum4);
00304 }
00305 
00306 template <class T>
00307 inline T vec_mahalanobis2(const T *a, const T *b, const T *c, int len)
00308 {
00309   T sum1=0, sum2=0, sum3=0, sum4=0;
00310   const T *end = a+len;
00311   while (a<end-3)
00312     {
00313       sum1+=c[0]*(a[0]-b[0])*(a[0]-b[0]);
00314       sum2+=c[1]*(a[1]-b[1])*(a[1]-b[1]);
00315       sum3+=c[2]*(a[2]-b[2])*(a[2]-b[2]);
00316       sum4+=c[3]*(a[3]-b[3])*(a[3]-b[3]);
00317       a+=4;
00318       b+=4;
00319       c+=4;
00320     }
00321   while (a<end)
00322     {
00323       sum1+=c[0]*(a[0]-b[0])*(a[0]-b[0]);
00324       a++; b++; c++;
00325     }
00326   return (sum1+sum2)+(sum3+sum4);
00327 }
00328 
00329 template <class T>
00330 inline T vec_sum(const T *a, int len)
00331 {
00332   T sum1=0, sum2=0, sum3=0, sum4=0;
00333   const T *end = a+len;
00334   while (a<end-3)
00335     {
00336       sum1+=a[0];
00337       sum2+=a[1];
00338       sum3+=a[2];
00339       sum4+=a[3];
00340       a+=4;
00341     }
00342   while (a<end)
00343     {
00344       sum1+=a[0];
00345       a++;
00346     }
00347   return (sum1+sum2)+(sum3+sum4);
00348 }
00349 
00350 template <class T>
00351 inline T vec_norm2(const T *a, int len)
00352 {
00353   T sum1=0, sum2=0, sum3=0, sum4=0;
00354   const T *end = a+len;
00355   while (a<end-3)
00356     {
00357       sum1+=a[0]*a[0];
00358       sum2+=a[1]*a[1];
00359       sum3+=a[2]*a[2];
00360       sum4+=a[3]*a[4];
00361       a+=4;
00362     }
00363   while (a<end)
00364     {
00365       sum1+=a[0]*a[0];
00366       a++;
00367     }
00368   return (sum1+sum2)+(sum3+sum4);
00369 }
00370 
00371 template <class T>
00372 inline void vec_inv(const T *a, T *b, int len)
00373 {
00374   const T *end = a+len;
00375   while (a<end-3)
00376     {
00377       b[0]=1/a[0];
00378       b[1]=1/a[1];
00379       b[2]=1/a[2];
00380       b[3]=1/a[3];
00381       a+=4; b+=4;
00382     }
00383   while (a<end)
00384     {
00385       b[0]=1/a[0];
00386       a++; b++;
00387     }
00388 }
00389 
00390 template <class T>
00391 inline void vec_sqrt(const T *a, T *b, int len)
00392 {
00393   const T *end = a+len;
00394   while (a<end-3)
00395     {
00396       b[0]=sqrt(a[0]);
00397       b[1]=sqrt(a[1]);
00398       b[2]=sqrt(a[2]);
00399       b[3]=sqrt(a[3]);
00400       a+=4; b+=4;
00401     }
00402   while (a<end)
00403     {
00404       b[0]=sqrt(a[0]);
00405       a++; b++;
00406     }
00407 }
00408 
00409 template <class T>
00410 inline void vec_corr_cont(const T *a, T *filt, T *out, int len, int filtLen)
00411 {
00412    for (int i=0;i<len;i++)
00413       out[i] = vec_inner_prod(a-filtLen+1, filt, filtLen);
00414 }
00415 
00416 template <class T>
00417 inline void vec_conv_cont(const T *a, T *filt, T *out, int len, int filtLen)
00418 {
00419    T filt2[filtLen];
00420    for (int i=0;i<filtLen;i++)
00421       filt2[i] = filt[filtLen-i-1];
00422    for (int i=0;i<len;i++)
00423       out[i] = vec_inner_prod(a-filtLen+1, filt2, filtLen);
00424 }
00425 
00426 
00427 
00428 
00429 /*Here are float (REAL*4) specific versions of the routines */
00430 
00431 
00432 inline float vec_inner_prod_float(const float *a, const float *b, int len)
00433 {
00434    float sum1=0, sum2=0, sum3=0, sum4=0;
00435    const float *end = a+len;
00436    while (a<end-3)
00437    {
00438       sum1+=a[0]*b[0];
00439       sum2+=a[1]*b[1];
00440       sum3+=a[2]*b[2];
00441       sum4+=a[3]*b[3];
00442       a+=4;
00443       b+=4;
00444    }
00445    while (a<end)
00446    {
00447       sum1+=a[0]*b[0];
00448       a++; b++;
00449    }
00450    return (sum1+sum2)+(sum3+sum4);
00451 }
00452 
00453 template <>
00454 inline float vec_inner_prod<float>(const float *a, const float *b, int len)
00455 {
00456 #ifndef WIN32
00457 
00458 #ifdef _ENABLE_3DNOW
00459   if (len >= 8 && IExtensions::have3DNow()) 
00460       return vec_inner_prod_3dnow(a,b,len);   
00461   else 
00462 #endif
00463 #ifdef _ENABLE_SSE
00464     if (len >=8 && IExtensions::haveSSE()) 
00465       return vec_inner_prod_sse(a,b,len);
00466     else 
00467 #endif
00468 #endif
00469       return vec_inner_prod_float(a,b,len);
00470 }
00471 
00472 
00473 }//namespace FD
00474 
00475 #endif /* ifndef VEC_H*/