00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #include "clustering/KMeans.h"
00013 #include "distance/Distance.h"
00014 #include "features/Labels.h"
00015 #include "features/SimpleFeatures.h"
00016 #include "lib/Mathematics.h"
00017 #include "base/Parallel.h"
00018
00019 #ifndef WIN32
00020 #include <pthread.h>
00021 #endif
00022
00023 #define MUSRECALC
00024
00025 #define PAR_THRESH 10
00026
00027 CKMeans::CKMeans()
00028 : CDistanceMachine(), max_iter(10000), k(3), dimensions(0), R(NULL),
00029 mus(NULL), Weights(NULL)
00030 {
00031 }
00032
00033 CKMeans::CKMeans(int32_t k_, CDistance* d)
00034 : CDistanceMachine(), max_iter(10000), k(k_), dimensions(0), R(NULL),
00035 mus(NULL), Weights(NULL)
00036 {
00037 set_distance(d);
00038 }
00039
00040 CKMeans::~CKMeans()
00041 {
00042 delete[] R;
00043 delete[] mus;
00044 }
00045
00046 bool CKMeans::train()
00047 {
00048 ASSERT(distance);
00049 ASSERT(distance->get_feature_type()==F_DREAL);
00050 ASSERT(distance->get_distance_type()==D_EUCLIDIAN);
00051 CSimpleFeatures<float64_t>* lhs=(CSimpleFeatures<float64_t>*) distance->get_lhs();
00052 ASSERT(lhs);
00053 int32_t num=lhs->get_num_vectors();
00054 SG_UNREF(lhs);
00055
00056 Weights=new float64_t[num];
00057 for (int32_t i=0; i<num; i++)
00058 Weights[i]=1.0;
00059
00060 clustknb(false, NULL);
00061 delete[] Weights;
00062
00063 return true;
00064 }
00065
00066 bool CKMeans::load(FILE* srcfile)
00067 {
00068 return false;
00069 }
00070
00071 bool CKMeans::save(FILE* dstfile)
00072 {
00073 return false;
00074 }
00075
00076 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00077 struct thread_data
00078 {
00079 float64_t* x;
00080 CSimpleFeatures<float64_t>* y;
00081 float64_t* z;
00082 int32_t n1, n2, m;
00083 int32_t js, je;
00084 int32_t offs;
00085 };
00086 #endif // DOXYGEN_SHOULD_SKIP_THIS
00087
00088 void *sqdist_thread_func(void * P)
00089 {
00090 struct thread_data *TD=(struct thread_data*) P;
00091 float64_t* x=TD->x;
00092 CSimpleFeatures<float64_t>* y=TD->y;
00093 float64_t* z=TD->z;
00094 int32_t n1=TD->n1,
00095 m=TD->m,
00096 js=TD->js,
00097 je=TD->je,
00098 offs=TD->offs,
00099 j,i,k;
00100
00101 for (j=js; j<je; j++)
00102 {
00103 int32_t vlen=0;
00104 bool vfree=false;
00105 float64_t* vec=y->get_feature_vector(j+offs, vlen, vfree);
00106
00107 for (i=0; i<n1; i++)
00108 {
00109 float64_t sum=0;
00110 for (k=0; k<m; k++)
00111 sum = sum + CMath::sq(x[i*m + k] - vec[k]);
00112 z[j*n1 + i] = sum;
00113 }
00114
00115 y->free_feature_vector(vec, j, vfree);
00116 }
00117 return NULL;
00118 }
00119
00120 void CKMeans::sqdist(
00121 float64_t* x, CSimpleFeatures<float64_t>* y, float64_t* z, int32_t n1, int32_t offs,
00122 int32_t n2, int32_t m)
00123 {
00124 const int32_t num_threads=parallel->get_num_threads();
00125 int32_t nc, n2_nc = n2/num_threads;
00126 thread_data* TD = new thread_data[num_threads];
00127 pthread_t* tid = new pthread_t[num_threads];
00128 void *status;
00129
00130
00131 TD[0].x=x ; TD[0].y=y ; TD[0].z=z ;
00132 TD[0].n1=n1 ; TD[0].n2=n2 ; TD[0].m=m;
00133 TD[0].offs=offs;
00134
00135 if (n2>PAR_THRESH)
00136 {
00137 ASSERT(PAR_THRESH>1);
00138
00139
00140 for (nc=0; nc<num_threads; nc++)
00141 {
00142 TD[nc]=TD[0];
00143 TD[nc].js=nc*n2_nc;
00144 if (nc+1==num_threads)
00145 TD[nc].je=n2;
00146 else
00147 TD[nc].je=(nc+1)*n2_nc;
00148
00149 pthread_create(&tid[nc], NULL, sqdist_thread_func, (void*)&TD[nc]);
00150 }
00151
00152
00153 for (nc=0; nc<num_threads; nc++)
00154 pthread_join(tid[nc], &status);
00155
00156 }
00157 else
00158 {
00159
00160 TD[0].js=0 ; TD[0].je=n2;
00161 sqdist_thread_func((void *)&TD[0]);
00162 }
00163
00164 delete[] tid;
00165 delete[] TD;
00166 }
00167
00168 void CKMeans::clustknb(bool use_old_mus, float64_t *mus_start)
00169 {
00170 ASSERT(distance && distance->get_feature_type()==F_DREAL);
00171 CSimpleFeatures<float64_t>* lhs = (CSimpleFeatures<float64_t>*) distance->get_lhs();
00172 ASSERT(lhs && lhs->get_num_features()>0 && lhs->get_num_vectors()>0);
00173
00174 int32_t XSize=lhs->get_num_vectors();
00175 dimensions=lhs->get_num_features();
00176 int32_t i, changed=1;
00177 const int32_t XDimk=dimensions*k;
00178 int32_t iter=0;
00179
00180 delete[] R;
00181 R=new float64_t[k];
00182
00183 delete[] mus;
00184 mus=new float64_t[XDimk];
00185
00186 int32_t *ClList = (int32_t*) calloc(XSize, sizeof(int32_t));
00187 float64_t *weights_set = (float64_t*) calloc(k, sizeof(float64_t));
00188 float64_t *oldmus = (float64_t*) calloc(XDimk, sizeof(float64_t));
00189 float64_t *dists = (float64_t*) calloc(k*XSize, sizeof(float64_t));
00190
00191 int32_t vlen=0;
00192 bool vfree=false;
00193 float64_t* vec=NULL;
00194
00195
00196 for (i=0; i<XSize; i++) ClList[i]=0;
00197
00198 for (i=0; i<k; i++) weights_set[i]=0;
00199
00200
00201 for (i=0; i<XDimk; i++) mus[i]=0;
00202
00203 if (!use_old_mus)
00204 {
00205
00206
00207
00208
00209
00210
00211
00212
00213
00214
00215
00216
00217
00218
00219
00220
00221
00222
00223 for (i=0; i<XSize; i++)
00224 {
00225 const int32_t Cl=CMath::random(0, k-1);
00226 int32_t j;
00227 float64_t weight=Weights[i];
00228
00229 weights_set[Cl]+=weight;
00230 ClList[i]=Cl;
00231
00232 vec=lhs->get_feature_vector(i, vlen, vfree);
00233
00234 for (j=0; j<dimensions; j++)
00235 mus[Cl*dimensions+j] += weight*vec[j];
00236
00237 lhs->free_feature_vector(vec, i, vfree);
00238 }
00239 for (i=0; i<k; i++)
00240 {
00241 int32_t j;
00242
00243 if (weights_set[i]!=0.0)
00244 for (j=0; j<dimensions; j++)
00245 mus[i*dimensions+j] /= weights_set[i];
00246 }
00247 }
00248 else
00249 {
00250 ASSERT(mus_start);
00251
00252 sqdist(mus_start, lhs, dists, k, 0, XSize, dimensions);
00253
00254 for (i=0; i<XSize; i++)
00255 {
00256 float64_t mini=dists[i*k];
00257 int32_t Cl = 0, j;
00258
00259 for (j=1; j<k; j++)
00260 {
00261 if (dists[i*k+j]<mini)
00262 {
00263 Cl=j;
00264 mini=dists[i*k+j];
00265 }
00266 }
00267 ClList[i]=Cl;
00268 }
00269
00270
00271
00272 for (i=0; i<XSize; i++)
00273 {
00274 const int32_t Cl = ClList[i];
00275 float64_t weight=Weights[i];
00276 weights_set[Cl]+=weight;
00277 #ifndef MUSRECALC
00278 vec=lhs->get_feature_vector(i, vlen, vfree);
00279
00280 for (j=0; j<dimensions; j++)
00281 mus[Cl*dimensions+j] += weight*vec[j];
00282
00283 lhs->free_feature_vector(vec, i, vfree);
00284 #endif
00285 }
00286 #ifndef MUSRECALC
00287
00288 for (i=0; i<k; i++)
00289 {
00290 if (weights_set[i]!=0.0)
00291 {
00292 int32_t j;
00293 for (j=0; j<dimensions; j++)
00294 mus[i*dimensions+j] /= weights_set[i];
00295 }
00296 }
00297 #endif
00298 }
00299
00300 for (i=0; i<XDimk; i++) oldmus[i]=-1;
00301
00302 while (changed && (iter<max_iter))
00303 {
00304 iter++;
00305 if (iter==max_iter-1)
00306 SG_WARNING("kmeans clustering changed throughout %d iterations stopping...\n", max_iter-1);
00307
00308 if (iter%1000 == 0)
00309 SG_INFO("Iteration[%d/%d]: Assignment of %i patterns changed.\n", iter, max_iter, changed);
00310 changed=0;
00311
00312 #ifdef MUSRECALC
00313
00314 for (i=0; i<XDimk; i++) mus[i]=0;
00315
00316 for (i=0; i<XSize; i++)
00317 {
00318 int32_t j;
00319 int32_t Cl=ClList[i];
00320 float64_t weight=Weights[i];
00321
00322 vec=lhs->get_feature_vector(i, vlen, vfree);
00323
00324 for (j=0; j<dimensions; j++)
00325 mus[Cl*dimensions+j] += weight*vec[j];
00326
00327 lhs->free_feature_vector(vec, i, vfree);
00328 }
00329 for (i=0; i<k; i++)
00330 {
00331 int32_t j;
00332
00333 if (weights_set[i]!=0.0)
00334 for (j=0; j<dimensions; j++)
00335 mus[i*dimensions+j] /= weights_set[i];
00336 }
00337 #endif
00338
00339 for (i=0; i<XSize; i++)
00340 {
00341
00342 const int32_t Pat= CMath::random(0, XSize-1);
00343 const int32_t ClList_Pat=ClList[Pat];
00344 int32_t imini, j;
00345 float64_t mini, weight;
00346
00347 weight=Weights[Pat];
00348
00349
00350
00351 sqdist(mus, lhs, dists, k, Pat, 1, dimensions);
00352
00353
00354 imini=0 ; mini=dists[0];
00355 for (j=1; j<k; j++)
00356 if (dists[j]<mini)
00357 {
00358 mini=dists[j];
00359 imini=j;
00360 }
00361
00362 if (imini!=ClList_Pat)
00363 {
00364 changed= changed + 1;
00365
00366
00367 weights_set[imini]+= weight;
00368
00369 weights_set[ClList_Pat]-= weight;
00370
00371
00372
00373 vec=lhs->get_feature_vector(Pat, vlen, vfree);
00374
00375 for (j=0; j<dimensions; j++)
00376 mus[imini*dimensions+j]-=(vec[j]-mus[imini*dimensions+j])*(weight/weights_set[imini]);
00377
00378 lhs->free_feature_vector(vec, Pat, vfree);
00379
00380
00381
00382 if (weights_set[ClList_Pat]!=0.0)
00383 {
00384
00385 vec=lhs->get_feature_vector(Pat, vlen, vfree);
00386
00387 for (j=0; j<dimensions; j++)
00388 mus[ClList_Pat*dimensions+j]-=(vec[j]-mus[ClList_Pat*dimensions+j])*(weight/weights_set[ClList_Pat]);
00389 lhs->free_feature_vector(vec, Pat, vfree);
00390 }
00391 else
00392
00393 for (j=0; j<dimensions; j++)
00394 mus[ClList_Pat*dimensions+j]=0;
00395
00396
00397 ClList[Pat] = imini;
00398 }
00399 }
00400 }
00401
00402
00403 for (i=0; i<k; i++)
00404 {
00405 float64_t rmin1=0;
00406 float64_t rmin2=0;
00407
00408 bool first_round=true;
00409
00410 for (int32_t j=0; j<k; j++)
00411 {
00412 if (j!=i)
00413 {
00414 int32_t l;
00415 float64_t dist = 0;
00416
00417 for (l=0; l<dimensions; l++)
00418 dist+=CMath::sq(mus[i*dimensions+l]-mus[j*dimensions+l]);
00419
00420 if (first_round)
00421 {
00422 rmin1=dist;
00423 rmin2=dist;
00424 first_round=false;
00425 }
00426 else
00427 {
00428 if ((dist<rmin2) && (dist>=rmin1))
00429 rmin2=dist;
00430
00431 if (dist<rmin1)
00432 {
00433 rmin2=rmin1;
00434 rmin1=dist;
00435 }
00436 }
00437 }
00438 }
00439
00440 R[i]=(0.7*sqrt(rmin1)+0.3*sqrt(rmin2));
00441 }
00442
00443 free(ClList);
00444 free(weights_set);
00445 free(oldmus);
00446 free(dists);
00447 SG_UNREF(lhs);
00448 }