KMeans.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2008 Gunnar Raetsch
00008  * Written (W) 2007-2009 Soeren Sonnenburg
00009  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
00010  */
00011 
00012 #include "clustering/KMeans.h"
00013 #include "distance/Distance.h"
00014 #include "features/Labels.h"
00015 #include "features/SimpleFeatures.h"
00016 #include "lib/Mathematics.h"
00017 #include "base/Parallel.h"
00018 
00019 #ifndef WIN32
00020 #include <pthread.h>
00021 #endif
00022 
00023 #define MUSRECALC
00024 
00025 #define PAR_THRESH  10
00026 
00027 CKMeans::CKMeans()
00028 : CDistanceMachine(), max_iter(10000), k(3), dimensions(0), R(NULL),
00029     mus(NULL), Weights(NULL)
00030 {
00031 }
00032 
00033 CKMeans::CKMeans(int32_t k_, CDistance* d)
00034 : CDistanceMachine(), max_iter(10000), k(k_), dimensions(0), R(NULL),
00035     mus(NULL), Weights(NULL)
00036 {
00037     set_distance(d);
00038 }
00039 
00040 CKMeans::~CKMeans()
00041 {
00042     delete[] R;
00043     delete[] mus;
00044 }
00045 
00046 bool CKMeans::train()
00047 {
00048     ASSERT(distance);
00049     ASSERT(distance->get_feature_type()==F_DREAL);
00050     ASSERT(distance->get_distance_type()==D_EUCLIDIAN);
00051     CSimpleFeatures<float64_t>* lhs=(CSimpleFeatures<float64_t>*) distance->get_lhs();
00052     ASSERT(lhs);
00053     int32_t num=lhs->get_num_vectors();
00054     SG_UNREF(lhs);
00055 
00056     Weights=new float64_t[num];
00057     for (int32_t i=0; i<num; i++)
00058         Weights[i]=1.0;
00059 
00060     clustknb(false, NULL);
00061     delete[] Weights;
00062 
00063     return true;
00064 }
00065 
00066 bool CKMeans::load(FILE* srcfile)
00067 {
00068     return false;
00069 }
00070 
00071 bool CKMeans::save(FILE* dstfile)
00072 {
00073     return false;
00074 }
00075 
00076 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00077 struct thread_data
00078 {
00079     float64_t* x;
00080     CSimpleFeatures<float64_t>* y;
00081     float64_t* z;
00082     int32_t n1, n2, m;
00083     int32_t js, je; /* defines the matrix stripe */
00084     int32_t offs;
00085 };
00086 #endif // DOXYGEN_SHOULD_SKIP_THIS
00087 
00088 void *sqdist_thread_func(void * P)
00089 {
00090     struct thread_data *TD=(struct thread_data*) P;
00091     float64_t* x=TD->x;
00092     CSimpleFeatures<float64_t>* y=TD->y;
00093     float64_t* z=TD->z;
00094     int32_t n1=TD->n1,
00095         m=TD->m,
00096         js=TD->js,
00097         je=TD->je,
00098         offs=TD->offs,
00099         j,i,k;
00100 
00101     for (j=js; j<je; j++)
00102     {
00103         int32_t vlen=0;
00104         bool vfree=false;
00105         float64_t* vec=y->get_feature_vector(j+offs, vlen, vfree);
00106 
00107         for (i=0; i<n1; i++)
00108         {
00109             float64_t sum=0;
00110             for (k=0; k<m; k++) 
00111                 sum = sum + CMath::sq(x[i*m + k] - vec[k]);
00112             z[j*n1 + i] = sum;
00113         }
00114 
00115         y->free_feature_vector(vec, j, vfree);
00116     }
00117     return NULL;
00118 } 
00119 
00120 void CKMeans::sqdist(
00121     float64_t* x, CSimpleFeatures<float64_t>* y, float64_t* z, int32_t n1, int32_t offs,
00122     int32_t n2, int32_t m)
00123 {
00124     const int32_t num_threads=parallel->get_num_threads();
00125     int32_t nc, n2_nc = n2/num_threads;
00126     thread_data* TD = new thread_data[num_threads];
00127     pthread_t* tid = new pthread_t[num_threads];
00128     void *status;
00129 
00130     /* prepare the structure */
00131     TD[0].x=x ; TD[0].y=y ; TD[0].z=z ; 
00132     TD[0].n1=n1 ; TD[0].n2=n2 ; TD[0].m=m;
00133     TD[0].offs=offs;
00134 
00135     if (n2>PAR_THRESH)
00136     {
00137         ASSERT(PAR_THRESH>1);
00138 
00139         /* create the threads */
00140         for (nc=0; nc<num_threads; nc++)
00141         {
00142             TD[nc]=TD[0];
00143             TD[nc].js=nc*n2_nc;
00144             if (nc+1==num_threads)
00145                 TD[nc].je=n2;
00146             else
00147                 TD[nc].je=(nc+1)*n2_nc;
00148 
00149             pthread_create(&tid[nc], NULL, sqdist_thread_func, (void*)&TD[nc]);
00150         }
00151 
00152         /* wait for finishing all threads */
00153         for (nc=0; nc<num_threads; nc++)
00154             pthread_join(tid[nc], &status);
00155 
00156     }
00157     else
00158     {
00159         /* simply call the ,,thread''-function */
00160         TD[0].js=0 ; TD[0].je=n2;
00161         sqdist_thread_func((void *)&TD[0]);
00162     }
00163 
00164     delete[] tid;
00165     delete[] TD;
00166 }
00167 
00168 void CKMeans::clustknb(bool use_old_mus, float64_t *mus_start)
00169 {
00170     ASSERT(distance && distance->get_feature_type()==F_DREAL);
00171     CSimpleFeatures<float64_t>* lhs = (CSimpleFeatures<float64_t>*) distance->get_lhs();
00172     ASSERT(lhs && lhs->get_num_features()>0 && lhs->get_num_vectors()>0);
00173 
00174     int32_t XSize=lhs->get_num_vectors();
00175     dimensions=lhs->get_num_features();
00176     int32_t i, changed=1;
00177     const int32_t XDimk=dimensions*k;
00178     int32_t iter=0;
00179 
00180     delete[] R;
00181     R=new float64_t[k];
00182 
00183     delete[] mus;
00184     mus=new float64_t[XDimk];
00185 
00186     int32_t *ClList = (int32_t*) calloc(XSize, sizeof(int32_t));
00187     float64_t *weights_set = (float64_t*) calloc(k, sizeof(float64_t));
00188     float64_t *oldmus = (float64_t*) calloc(XDimk, sizeof(float64_t));
00189     float64_t *dists = (float64_t*) calloc(k*XSize, sizeof(float64_t));
00190 
00191     int32_t vlen=0;
00192     bool vfree=false;
00193     float64_t* vec=NULL;
00194 
00195     /* ClList=zeros(XSize,1) ; */
00196     for (i=0; i<XSize; i++) ClList[i]=0;
00197     /* weights_set=zeros(k,1) ; */
00198     for (i=0; i<k; i++) weights_set[i]=0;
00199 
00200     /* mus=zeros(dimensions, k) ; */
00201     for (i=0; i<XDimk; i++) mus[i]=0;
00202 
00203     if (!use_old_mus)
00204     {
00205         /* random clustering (select random subsets) */
00206         /*  ks=ceil(rand(1,XSize)*k);
00207          *  for i=1:k,
00208          *  actks= (ks==i);
00209          *  c=sum(actks);
00210          *  weights_set(i)=c;
00211          *
00212          *  ClList(actks)=i*ones(1, c);
00213          *
00214          *  if ~mus_recalc,
00215          *      if c>1
00216          *          mus(:,i) = sum(XData(:,actks)')'/c;
00217          *      elseif c>0
00218          *          mus(:,i) = XData(:,actks);
00219          *      end;
00220          *  end;
00221          *   end ; */
00222 
00223         for (i=0; i<XSize; i++) 
00224         {
00225             const int32_t Cl=CMath::random(0, k-1);
00226             int32_t j;
00227             float64_t weight=Weights[i];
00228 
00229             weights_set[Cl]+=weight;
00230             ClList[i]=Cl;
00231 
00232             vec=lhs->get_feature_vector(i, vlen, vfree);
00233 
00234             for (j=0; j<dimensions; j++)
00235                 mus[Cl*dimensions+j] += weight*vec[j];
00236 
00237             lhs->free_feature_vector(vec, i, vfree);
00238         }
00239         for (i=0; i<k; i++)
00240         {
00241             int32_t j;
00242 
00243             if (weights_set[i]!=0.0)
00244                 for (j=0; j<dimensions; j++)
00245                     mus[i*dimensions+j] /= weights_set[i];
00246         }
00247     }
00248     else 
00249     {
00250         ASSERT(mus_start);
00251 
00252         sqdist(mus_start, lhs, dists, k, 0, XSize, dimensions);
00253 
00254         for (i=0; i<XSize; i++)
00255         {
00256             float64_t mini=dists[i*k];
00257             int32_t Cl = 0, j;
00258 
00259             for (j=1; j<k; j++)
00260             {
00261                 if (dists[i*k+j]<mini)
00262                 {
00263                     Cl=j;
00264                     mini=dists[i*k+j];
00265                 }
00266             }
00267             ClList[i]=Cl;
00268         }
00269 
00270         /* Compute the sum of all points belonging to a cluster 
00271          * and count the points */
00272         for (i=0; i<XSize; i++) 
00273         {
00274             const int32_t Cl = ClList[i];
00275             float64_t weight=Weights[i];
00276             weights_set[Cl]+=weight;
00277 #ifndef MUSRECALC
00278             vec=lhs->get_feature_vector(i, vlen, vfree);
00279 
00280             for (j=0; j<dimensions; j++)
00281                 mus[Cl*dimensions+j] += weight*vec[j];
00282 
00283             lhs->free_feature_vector(vec, i, vfree);
00284 #endif
00285         }
00286 #ifndef MUSRECALC
00287         /* normalization to get the mean */ 
00288         for (i=0; i<k; i++)
00289         {
00290             if (weights_set[i]!=0.0)
00291             {
00292                 int32_t j;
00293                 for (j=0; j<dimensions; j++)
00294                     mus[i*dimensions+j] /= weights_set[i];
00295             }
00296         }
00297 #endif
00298     }
00299 
00300     for (i=0; i<XDimk; i++) oldmus[i]=-1;
00301 
00302     while (changed && (iter<max_iter))
00303     {
00304         iter++;
00305         if (iter==max_iter-1)
00306             SG_WARNING("kmeans clustering changed throughout %d iterations stopping...\n", max_iter-1);
00307 
00308         if (iter%1000 == 0)
00309             SG_INFO("Iteration[%d/%d]: Assignment of %i patterns changed.\n", iter, max_iter, changed);
00310         changed=0;
00311 
00312 #ifdef MUSRECALC
00313         /* mus=zeros(dimensions, k) ; */
00314         for (i=0; i<XDimk; i++) mus[i]=0;
00315 
00316         for (i=0; i<XSize; i++) 
00317         {
00318             int32_t j;
00319             int32_t Cl=ClList[i];
00320             float64_t weight=Weights[i];
00321 
00322             vec=lhs->get_feature_vector(i, vlen, vfree);
00323 
00324             for (j=0; j<dimensions; j++)
00325                 mus[Cl*dimensions+j] += weight*vec[j];
00326 
00327             lhs->free_feature_vector(vec, i, vfree);
00328         }
00329         for (i=0; i<k; i++)
00330         {
00331             int32_t j;
00332 
00333             if (weights_set[i]!=0.0)
00334                 for (j=0; j<dimensions; j++)
00335                     mus[i*dimensions+j] /= weights_set[i];
00336         }
00337 #endif
00338 
00339         for (i=0; i<XSize; i++)
00340         {
00341             /* ks=ceil(rand(1,XSize)*XSize) ; */
00342             const int32_t Pat= CMath::random(0, XSize-1);
00343             const int32_t ClList_Pat=ClList[Pat];
00344             int32_t imini, j;
00345             float64_t mini, weight;
00346 
00347             weight=Weights[Pat];
00348 
00349             /* compute the distance of this point to all centers */
00350             /* dists=sqdist(mus',XData) ; */
00351             sqdist(mus, lhs, dists, k, Pat, 1, dimensions);
00352 
00353             /* [mini,imini]=min(dists(:,i)) ; */
00354             imini=0 ; mini=dists[0];
00355             for (j=1; j<k; j++)
00356                 if (dists[j]<mini)
00357                 {
00358                     mini=dists[j];
00359                     imini=j;
00360                 }
00361 
00362             if (imini!=ClList_Pat)
00363             {
00364                 changed= changed + 1;
00365 
00366                 /* weights_set(imini) = weights_set(imini) + weight ; */
00367                 weights_set[imini]+= weight;
00368                 /* weights_set(j)     = weights_set(j)     - weight ; */
00369                 weights_set[ClList_Pat]-= weight;
00370 
00371                 /* mu_new=mu_old + (x - mu_old)/(n+1) */
00372                 /* mus(:,imini)=mus(:,imini) + (XData(:,i) - mus(:,imini)) * (weight / weights_set(imini)) ; */
00373                 vec=lhs->get_feature_vector(Pat, vlen, vfree);
00374 
00375                 for (j=0; j<dimensions; j++)
00376                     mus[imini*dimensions+j]-=(vec[j]-mus[imini*dimensions+j])*(weight/weights_set[imini]);
00377 
00378                 lhs->free_feature_vector(vec, Pat, vfree);
00379 
00380                 /* mu_new = mu_old - (x - mu_old)/(n-1) */
00381                 /* if weights_set(j)~=0 */
00382                 if (weights_set[ClList_Pat]!=0.0)
00383                 {
00384                     /* mus(:,j)=mus(:,j) - (XData(:,i) - mus(:,j)) * (weight/weights_set(j)) ; */
00385                     vec=lhs->get_feature_vector(Pat, vlen, vfree);
00386 
00387                     for (j=0; j<dimensions; j++)
00388                         mus[ClList_Pat*dimensions+j]-=(vec[j]-mus[ClList_Pat*dimensions+j])*(weight/weights_set[ClList_Pat]);
00389                     lhs->free_feature_vector(vec, Pat, vfree);
00390                 }
00391                 else
00392                     /*  mus(:,j)=zeros(dimensions,1) ; */
00393                     for (j=0; j<dimensions; j++)
00394                         mus[ClList_Pat*dimensions+j]=0;
00395 
00396                 /* ClList(i)= imini ; */
00397                 ClList[Pat] = imini;
00398             }
00399         }
00400     }
00401 
00402     /* compute the ,,variances'' of the clusters */
00403     for (i=0; i<k; i++)
00404     {
00405         float64_t rmin1=0;
00406         float64_t rmin2=0;
00407 
00408         bool first_round=true;
00409 
00410         for (int32_t j=0; j<k; j++)
00411         {
00412             if (j!=i)
00413             {
00414                 int32_t l;
00415                 float64_t dist = 0;
00416 
00417                 for (l=0; l<dimensions; l++)
00418                     dist+=CMath::sq(mus[i*dimensions+l]-mus[j*dimensions+l]);
00419 
00420                 if (first_round)
00421                 {
00422                     rmin1=dist;
00423                     rmin2=dist;
00424                     first_round=false;
00425                 }
00426                 else
00427                 {
00428                     if ((dist<rmin2) && (dist>=rmin1))
00429                         rmin2=dist;
00430 
00431                     if (dist<rmin1) 
00432                     {
00433                         rmin2=rmin1;
00434                         rmin1=dist;
00435                     }
00436                 }
00437             }
00438         }
00439 
00440         R[i]=(0.7*sqrt(rmin1)+0.3*sqrt(rmin2));
00441     }
00442 
00443     free(ClList);
00444     free(weights_set);
00445     free(oldmus);
00446     free(dists);
00447     SG_UNREF(lhs);
00448 } 

SHOGUN Machine Learning Toolbox - Documentation