00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #include "lib/config.h"
00013 #include "lib/Mathematics.h"
00014
00015 #include <string.h>
00016 #include <stdlib.h>
00017
00018 #ifdef HAVE_LAPACK
00019 #include "lib/lapack.h"
00020
00021 #include "lib/common.h"
00022 #include "preproc/PCACut.h"
00023 #include "preproc/SimplePreProc.h"
00024 #include "features/Features.h"
00025 #include "features/SimpleFeatures.h"
00026 #include "lib/io.h"
00027
00028 CPCACut::CPCACut(int32_t do_whitening_, float64_t thresh_)
00029 : CSimplePreProc<float64_t>("PCACut", "PCAC"), T(NULL), num_dim(0), mean(NULL),
00030 initialized(false), do_whitening(do_whitening_), thresh(thresh_)
00031 {
00032 }
00033
00034 CPCACut::~CPCACut()
00035 {
00036 delete[] T;
00037 delete[] mean;
00038 }
00039
00041 bool CPCACut::init(CFeatures* f)
00042 {
00043 if (!initialized)
00044 {
00045 ASSERT(f->get_feature_class()==C_SIMPLE);
00046 ASSERT(f->get_feature_type()==F_DREAL);
00047
00048 SG_INFO("calling CPCACut::init\n") ;
00049 int32_t num_vectors=((CSimpleFeatures<float64_t>*)f)->get_num_vectors() ;
00050 int32_t num_features=((CSimpleFeatures<float64_t>*)f)->get_num_features() ;
00051 SG_INFO("num_examples: %ld num_features: %ld \n", num_vectors, num_features);
00052 delete[] mean ;
00053 mean=new float64_t[num_features+1] ;
00054
00055 int32_t i,j;
00056
00058
00059
00060 for (j=0; j<num_features; j++)
00061 mean[j]=0 ;
00062
00063
00064 for (i=0; i<num_vectors; i++)
00065 {
00066 int32_t len;
00067 bool free;
00068 float64_t* vec=((CSimpleFeatures<float64_t>*) f)->get_feature_vector(i, len, free);
00069 for (j=0; j<num_features; j++)
00070 mean[j]+= vec[j];
00071
00072 ((CSimpleFeatures<float64_t>*) f)->free_feature_vector(vec, i, free);
00073 }
00074
00075
00076 for (j=0; j<num_features; j++)
00077 mean[j]/=num_vectors;
00078
00079 SG_DONE();
00080 SG_DEBUG("Computing covariance matrix... of size %.2f M\n", num_features*num_features/1024.0/1024.0);
00081 float64_t *cov=new float64_t[num_features*num_features];
00082
00083 for (j=0; j<num_features*num_features; j++)
00084 cov[j]=0.0 ;
00085
00086 for (i=0; i<num_vectors; i++)
00087 {
00088 if (!(i % (num_vectors/10+1)))
00089 SG_PROGRESS(i, 0, num_vectors);
00090
00091 int32_t len;
00092 bool free;
00093
00094 float64_t* vec=((CSimpleFeatures<float64_t>*) f)->get_feature_vector(i, len, free) ;
00095
00096 for (int32_t jj=0; jj<num_features; jj++)
00097 vec[jj]-=mean[jj] ;
00098
00100 int nf = (int) num_features;
00101 double* vec_double = (double*) vec;
00102 cblas_dger(CblasColMajor, nf, nf, 1.0, vec_double, 1, vec_double,
00103 1, (double*) cov, nf);
00104
00105
00106
00107
00108
00109 ((CSimpleFeatures<float64_t>*) f)->free_feature_vector(vec, i, free) ;
00110 }
00111
00112 SG_DONE();
00113
00114 for (i=0; i<num_features; i++)
00115 for (j=0; j<num_features; j++)
00116 cov[i*num_features+j]/=num_vectors ;
00117
00118 SG_DONE();
00119
00120 SG_INFO("Computing Eigenvalues ... ") ;
00121 char V='V';
00122 char U='U';
00123 int32_t info;
00124 int32_t ord= num_features;
00125 int32_t lda= num_features;
00126 float64_t* eigenvalues=new float64_t[num_features] ;
00127
00128 for (i=0; i<num_features; i++)
00129 eigenvalues[i]=0;
00130
00131
00132 wrap_dsyev(V, U, (int) ord, (double*) cov, (int) lda,
00133 (double*) eigenvalues, (int*) &info);
00134
00135
00136 num_dim=0;
00137 for (i=0; i<num_features; i++)
00138 {
00139
00140 if (eigenvalues[i]>thresh)
00141 num_dim++ ;
00142 } ;
00143
00144 SG_INFO("Done\nReducing from %i to %i features..", num_features, num_dim) ;
00145
00146 delete[] T;
00147 T=new float64_t[num_dim*num_features];
00148 num_old_dim=num_features;
00149
00150 if (do_whitening)
00151 {
00152 int32_t offs=0 ;
00153 for (i=0; i<num_features; i++)
00154 {
00155 if (eigenvalues[i]>thresh)
00156 {
00157 for (int32_t jj=0; jj<num_features; jj++)
00158 T[offs+jj*num_dim]=cov[num_features*i+jj]/sqrt(eigenvalues[i]) ;
00159 offs++ ;
00160 } ;
00161 }
00162 } ;
00163
00164 delete[] eigenvalues;
00165 delete[] cov;
00166 initialized=true;
00167 SG_INFO("Done\n") ;
00168 return true ;
00169 }
00170 return
00171 false;
00172 }
00173
00175 void CPCACut::cleanup()
00176 {
00177 delete[] T ;
00178 T=NULL ;
00179 }
00180
00184 float64_t* CPCACut::apply_to_feature_matrix(CFeatures* f)
00185 {
00186 int32_t num_vectors=0;
00187 int32_t num_features=0;
00188
00189 float64_t* m=((CSimpleFeatures<float64_t>*) f)->get_feature_matrix(num_features, num_vectors);
00190 SG_INFO("get Feature matrix: %ix%i\n", num_vectors, num_features) ;
00191
00192 if (m)
00193 {
00194 SG_INFO("Preprocessing feature matrix\n");
00195 float64_t* res= new float64_t[num_dim];
00196 float64_t* sub_mean= new float64_t[num_features];
00197
00198 for (int32_t vec=0; vec<num_vectors; vec++)
00199 {
00200 int32_t i;
00201
00202 for (i=0; i<num_features; i++)
00203 sub_mean[i]=m[num_features*vec+i]-mean[i] ;
00204
00205 int nd = (int) num_dim;
00206 cblas_dgemv(CblasColMajor, CblasNoTrans, nd, (int) num_features,
00207 1.0, T, nd, (double*) sub_mean, 1, 0, (double*) res, 1);
00208
00209 float64_t* m_transformed=&m[num_dim*vec];
00210 for (i=0; i<num_dim; i++)
00211 m_transformed[i]=m[i];
00212 }
00213 delete[] res;
00214 delete[] sub_mean;
00215
00216 ((CSimpleFeatures<float64_t>*) f)->set_num_features(num_dim);
00217 ((CSimpleFeatures<float64_t>*) f)->get_feature_matrix(num_features, num_vectors);
00218 SG_INFO("new Feature matrix: %ix%i\n", num_vectors, num_features);
00219 }
00220
00221 return m;
00222 }
00223
00226 float64_t* CPCACut::apply_to_feature_vector(float64_t* f, int32_t &len)
00227 {
00228 float64_t *ret=new float64_t[num_dim];
00229 float64_t *sub_mean=new float64_t[len];
00230 for (int32_t i=0; i<len; i++)
00231 sub_mean[i]=f[i]-mean[i];
00232
00233 int nd = (int) num_dim;
00234 cblas_dgemv(CblasColMajor, CblasNoTrans, nd, (int) len, 1.0, (double*) T,
00235 nd, (double*) sub_mean, 1, 0, (double*) ret, 1);
00236
00237 delete[] sub_mean ;
00238 len=num_dim ;
00239
00240 return ret;
00241 }
00242
00244 bool CPCACut::load_init_data(FILE* src)
00245 {
00246 ASSERT(fread(&num_dim, sizeof(int), 1, src)==1);
00247 ASSERT(fread(&num_old_dim, sizeof(int), 1, src)==1);
00248 delete[] mean;
00249 delete[] T;
00250 mean=new float64_t[num_dim];
00251 T=new float64_t[num_dim*num_old_dim];
00252 ASSERT (mean!=NULL && T!=NULL);
00253 ASSERT(fread(mean, sizeof(float64_t), num_old_dim, src)==(uint32_t) num_old_dim);
00254 ASSERT(fread(T, sizeof(float64_t), num_dim*num_old_dim, src)==(uint32_t) num_old_dim*num_dim);
00255 return true;
00256 }
00257
00259 bool CPCACut::save_init_data(FILE* dst)
00260 {
00261 ASSERT(fwrite(&num_dim, sizeof(int), 1, dst)==1);
00262 ASSERT(fwrite(&num_old_dim, sizeof(int), 1, dst)==1);
00263 ASSERT(fwrite(mean, sizeof(float64_t), num_old_dim, dst)==(uint32_t) num_old_dim);
00264 ASSERT(fwrite(T, sizeof(float64_t), num_dim*num_old_dim, dst)==(uint32_t) num_old_dim*num_dim);
00265 return true;
00266 }
00267 #endif // HAVE_LAPACK