KMeans.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2008 Gunnar Raetsch
00008  * Written (W) 2007-2009 Soeren Sonnenburg
00009  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
00010  */
00011 
00012 #include "clustering/KMeans.h"
00013 #include "distance/Distance.h"
00014 #include "features/Labels.h"
00015 #include "features/SimpleFeatures.h"
00016 #include "lib/Mathematics.h"
00017 #include "base/Parallel.h"
00018 
00019 #ifndef WIN32
00020 #include <pthread.h>
00021 #endif
00022 
00023 #define MUSRECALC
00024 
00025 #define PAR_THRESH  10
00026 
00027 using namespace shogun;
00028 
00029 CKMeans::CKMeans()
00030 : CDistanceMachine(), max_iter(10000), k(3), dimensions(0), R(NULL),
00031     mus(NULL), Weights(NULL)
00032 {
00033 }
00034 
00035 CKMeans::CKMeans(int32_t k_, CDistance* d)
00036 : CDistanceMachine(), max_iter(10000), k(k_), dimensions(0), R(NULL),
00037     mus(NULL), Weights(NULL)
00038 {
00039     set_distance(d);
00040 }
00041 
00042 CKMeans::~CKMeans()
00043 {
00044     delete[] R;
00045     delete[] mus;
00046 }
00047 
00048 bool CKMeans::train(CFeatures* data)
00049 {
00050     ASSERT(distance);
00051 
00052     if (data)
00053         distance->init(data, data);
00054 
00055     ASSERT(distance->get_feature_type()==F_DREAL);
00056 
00057     CSimpleFeatures<float64_t>* lhs=(CSimpleFeatures<float64_t>*) distance->get_lhs();
00058     ASSERT(lhs);
00059     int32_t num=lhs->get_num_vectors();
00060     SG_UNREF(lhs);
00061 
00062     Weights=new float64_t[num];
00063     for (int32_t i=0; i<num; i++)
00064         Weights[i]=1.0;
00065 
00066     clustknb(false, NULL);
00067     delete[] Weights;
00068 
00069     return true;
00070 }
00071 
00072 bool CKMeans::load(FILE* srcfile)
00073 {
00074     return false;
00075 }
00076 
00077 bool CKMeans::save(FILE* dstfile)
00078 {
00079     return false;
00080 }
00081 
00082 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00083 struct thread_data
00084 {
00085     float64_t* x;
00086     CSimpleFeatures<float64_t>* y;
00087     float64_t* z;
00088     int32_t n1, n2, m;
00089     int32_t js, je; /* defines the matrix stripe */
00090     int32_t offs;
00091 };
00092 #endif // DOXYGEN_SHOULD_SKIP_THIS
00093 
00094 namespace shogun
00095 {
00096 void *sqdist_thread_func(void * P)
00097 {
00098     struct thread_data *TD=(struct thread_data*) P;
00099     float64_t* x=TD->x;
00100     CSimpleFeatures<float64_t>* y=TD->y;
00101     float64_t* z=TD->z;
00102     int32_t n1=TD->n1,
00103         m=TD->m,
00104         js=TD->js,
00105         je=TD->je,
00106         offs=TD->offs,
00107         j,i,k;
00108 
00109     for (j=js; j<je; j++)
00110     {
00111         int32_t vlen=0;
00112         bool vfree=false;
00113         float64_t* vec=y->get_feature_vector(j+offs, vlen, vfree);
00114 
00115         for (i=0; i<n1; i++)
00116         {
00117             float64_t sum=0;
00118             for (k=0; k<m; k++) 
00119                 sum = sum + CMath::sq(x[i*m + k] - vec[k]);
00120             z[j*n1 + i] = sum;
00121         }
00122 
00123         y->free_feature_vector(vec, j, vfree);
00124     }
00125     return NULL;
00126 } 
00127 }
00128 
00129 void CKMeans::sqdist(
00130     float64_t* x, CSimpleFeatures<float64_t>* y, float64_t* z, int32_t n1, int32_t offs,
00131     int32_t n2, int32_t m)
00132 {
00133     const int32_t num_threads=parallel->get_num_threads();
00134     int32_t nc, n2_nc = n2/num_threads;
00135     thread_data* TD = new thread_data[num_threads];
00136     pthread_t* tid = new pthread_t[num_threads];
00137     void *status;
00138 
00139     /* prepare the structure */
00140     TD[0].x=x ; TD[0].y=y ; TD[0].z=z ; 
00141     TD[0].n1=n1 ; TD[0].n2=n2 ; TD[0].m=m;
00142     TD[0].offs=offs;
00143 
00144     if (n2>PAR_THRESH)
00145     {
00146         ASSERT(PAR_THRESH>1);
00147 
00148         /* create the threads */
00149         for (nc=0; nc<num_threads; nc++)
00150         {
00151             TD[nc]=TD[0];
00152             TD[nc].js=nc*n2_nc;
00153             if (nc+1==num_threads)
00154                 TD[nc].je=n2;
00155             else
00156                 TD[nc].je=(nc+1)*n2_nc;
00157 
00158             pthread_create(&tid[nc], NULL, sqdist_thread_func, (void*)&TD[nc]);
00159         }
00160 
00161         /* wait for finishing all threads */
00162         for (nc=0; nc<num_threads; nc++)
00163             pthread_join(tid[nc], &status);
00164 
00165     }
00166     else
00167     {
00168         /* simply call the ,,thread''-function */
00169         TD[0].js=0 ; TD[0].je=n2;
00170         sqdist_thread_func((void *)&TD[0]);
00171     }
00172 
00173     delete[] tid;
00174     delete[] TD;
00175 }
00176 
00177 void CKMeans::clustknb(bool use_old_mus, float64_t *mus_start)
00178 {
00179     ASSERT(distance && distance->get_feature_type()==F_DREAL);
00180     CSimpleFeatures<float64_t>* lhs = (CSimpleFeatures<float64_t>*) distance->get_lhs();
00181     ASSERT(lhs && lhs->get_num_features()>0 && lhs->get_num_vectors()>0);
00182 
00183     int32_t XSize=lhs->get_num_vectors();
00184     dimensions=lhs->get_num_features();
00185     int32_t i, changed=1;
00186     const int32_t XDimk=dimensions*k;
00187     int32_t iter=0;
00188 
00189     delete[] R;
00190     R=new float64_t[k];
00191 
00192     delete[] mus;
00193     mus=new float64_t[XDimk];
00194 
00195     int32_t *ClList = (int32_t*) calloc(XSize, sizeof(int32_t));
00196     float64_t *weights_set = (float64_t*) calloc(k, sizeof(float64_t));
00197     float64_t *dists = (float64_t*) calloc(k*XSize, sizeof(float64_t));
00199         CSimpleFeatures<float64_t>* rhs_mus = new CSimpleFeatures<float64_t>(0);
00200         CFeatures* rhs_cache = distance->replace_rhs(rhs_mus);
00201  
00202     int32_t vlen=0;
00203     bool vfree=false;
00204     float64_t* vec=NULL;
00205 
00206     /* ClList=zeros(XSize,1) ; */
00207     for (i=0; i<XSize; i++) ClList[i]=0;
00208     /* weights_set=zeros(k,1) ; */
00209     for (i=0; i<k; i++) weights_set[i]=0;
00210 
00211     /* mus=zeros(dimensions, k) ; */
00212     for (i=0; i<XDimk; i++) mus[i]=0;
00213 
00214     if (!use_old_mus)
00215     {
00216         /* random clustering (select random subsets) */
00217         /*  ks=ceil(rand(1,XSize)*k);
00218          *  for i=1:k,
00219          *  actks= (ks==i);
00220          *  c=sum(actks);
00221          *  weights_set(i)=c;
00222          *
00223          *  ClList(actks)=i*ones(1, c);
00224          *
00225          *  if ~mus_recalc,
00226          *      if c>1
00227          *          mus(:,i) = sum(XData(:,actks)')'/c;
00228          *      elseif c>0
00229          *          mus(:,i) = XData(:,actks);
00230          *      end;
00231          *  end;
00232          *   end ; */
00233 
00234         for (i=0; i<XSize; i++) 
00235         {
00236             const int32_t Cl=CMath::random(0, k-1);
00237             int32_t j;
00238             float64_t weight=Weights[i];
00239 
00240             weights_set[Cl]+=weight;
00241             ClList[i]=Cl;
00242 
00243             vec=lhs->get_feature_vector(i, vlen, vfree);
00244 
00245             for (j=0; j<dimensions; j++)
00246                 mus[Cl*dimensions+j] += weight*vec[j];
00247 
00248             lhs->free_feature_vector(vec, i, vfree);
00249         }
00250         for (i=0; i<k; i++)
00251         {
00252             int32_t j;
00253 
00254             if (weights_set[i]!=0.0)
00255                 for (j=0; j<dimensions; j++)
00256                     mus[i*dimensions+j] /= weights_set[i];
00257         }
00258     }
00259     else 
00260     {
00261         ASSERT(mus_start);
00262 
00263 //      sqdist(mus_start, lhs, dists, k, 0, XSize, dimensions);
00265                 rhs_mus->copy_feature_matrix(mus_start,dimensions,k);
00266                 float64_t* p_dists=dists;
00267                 
00268                 for(int32_t idx=0;idx<XSize;idx++,p_dists+=k)
00269                     distances_rhs(p_dists,0,k,idx);
00270         p_dists=NULL;            
00271 
00272         for (i=0; i<XSize; i++)
00273         {
00274             float64_t mini=dists[i*k];
00275             int32_t Cl = 0, j;
00276 
00277             for (j=1; j<k; j++)
00278             {
00279                 if (dists[i*k+j]<mini)
00280                 {
00281                     Cl=j;
00282                     mini=dists[i*k+j];
00283                 }
00284             }
00285             ClList[i]=Cl;
00286         }
00287 
00288         /* Compute the sum of all points belonging to a cluster 
00289          * and count the points */
00290         for (i=0; i<XSize; i++) 
00291         {
00292             const int32_t Cl = ClList[i];
00293             float64_t weight=Weights[i];
00294             weights_set[Cl]+=weight;
00295 #ifndef MUSRECALC
00296             vec=lhs->get_feature_vector(i, vlen, vfree);
00297 
00298             for (j=0; j<dimensions; j++)
00299                 mus[Cl*dimensions+j] += weight*vec[j];
00300 
00301             lhs->free_feature_vector(vec, i, vfree);
00302 #endif
00303         }
00304 #ifndef MUSRECALC
00305         /* normalization to get the mean */ 
00306         for (i=0; i<k; i++)
00307         {
00308             if (weights_set[i]!=0.0)
00309             {
00310                 int32_t j;
00311                 for (j=0; j<dimensions; j++)
00312                     mus[i*dimensions+j] /= weights_set[i];
00313             }
00314         }
00315 #endif
00316     }
00317 
00318 
00319 
00320     while (changed && (iter<max_iter))
00321     {
00322         iter++;
00323         if (iter==max_iter-1)
00324             SG_WARNING("kmeans clustering changed throughout %d iterations stopping...\n", max_iter-1);
00325 
00326         if (iter%1000 == 0)
00327             SG_INFO("Iteration[%d/%d]: Assignment of %i patterns changed.\n", iter, max_iter, changed);
00328         changed=0;
00329 
00330 #ifdef MUSRECALC
00331         /* mus=zeros(dimensions, k) ; */
00332         for (i=0; i<XDimk; i++) mus[i]=0;
00333 
00334         for (i=0; i<XSize; i++) 
00335         {
00336             int32_t j;
00337             int32_t Cl=ClList[i];
00338             float64_t weight=Weights[i];
00339 
00340             vec=lhs->get_feature_vector(i, vlen, vfree);
00341 
00342             for (j=0; j<dimensions; j++)
00343                 mus[Cl*dimensions+j] += weight*vec[j];
00344 
00345             lhs->free_feature_vector(vec, i, vfree);
00346         }
00347         for (i=0; i<k; i++)
00348         {
00349             int32_t j;
00350 
00351             if (weights_set[i]!=0.0)
00352                 for (j=0; j<dimensions; j++)
00353                     mus[i*dimensions+j] /= weights_set[i];
00354         }
00355 #endif
00356 
00357                 rhs_mus->copy_feature_matrix(mus,dimensions,k);
00358 
00359         for (i=0; i<XSize; i++)
00360         {
00361             /* ks=ceil(rand(1,XSize)*XSize) ; */
00362             const int32_t Pat= CMath::random(0, XSize-1);
00363             const int32_t ClList_Pat=ClList[Pat];
00364             int32_t imini, j;
00365             float64_t mini, weight;
00366 
00367             weight=Weights[Pat];
00368 
00369             /* compute the distance of this point to all centers */
00370             /* dists=sqdist(mus',XData) ; */
00372                         for(int32_t idx_k=0;idx_k<k;idx_k++)
00373                             dists[idx_k]=distance->distance(Pat,idx_k);
00374            
00375             /* [mini,imini]=min(dists(:,i)) ; */
00376             imini=0 ; mini=dists[0];
00377             for (j=1; j<k; j++)
00378                 if (dists[j]<mini)
00379                 {
00380                     mini=dists[j];
00381                     imini=j;
00382                 }
00383 
00384             if (imini!=ClList_Pat)
00385             {
00386                 changed= changed + 1;
00387 
00388                 /* weights_set(imini) = weights_set(imini) + weight ; */
00389                 weights_set[imini]+= weight;
00390                 /* weights_set(j)     = weights_set(j)     - weight ; */
00391                 weights_set[ClList_Pat]-= weight;
00392 
00393                 /* mu_new=mu_old + (x - mu_old)/(n+1) */
00394                 /* mus(:,imini)=mus(:,imini) + (XData(:,i) - mus(:,imini)) * (weight / weights_set(imini)) ; */
00395                 vec=lhs->get_feature_vector(Pat, vlen, vfree);
00396 
00397                 for (j=0; j<dimensions; j++)
00398                     mus[imini*dimensions+j]-=(vec[j]-mus[imini*dimensions+j])*(weight/weights_set[imini]);
00399 
00400                 lhs->free_feature_vector(vec, Pat, vfree);
00401 
00402                 /* mu_new = mu_old - (x - mu_old)/(n-1) */
00403                 /* if weights_set(j)~=0 */
00404                 if (weights_set[ClList_Pat]!=0.0)
00405                 {
00406                     /* mus(:,j)=mus(:,j) - (XData(:,i) - mus(:,j)) * (weight/weights_set(j)) ; */
00407                     vec=lhs->get_feature_vector(Pat, vlen, vfree);
00408 
00409                     for (j=0; j<dimensions; j++)
00410                         mus[ClList_Pat*dimensions+j]-=(vec[j]-mus[ClList_Pat*dimensions+j])*(weight/weights_set[ClList_Pat]);
00411                     lhs->free_feature_vector(vec, Pat, vfree);
00412                 }
00413                 else
00414                     /*  mus(:,j)=zeros(dimensions,1) ; */
00415                     for (j=0; j<dimensions; j++)
00416                         mus[ClList_Pat*dimensions+j]=0;
00417 
00418                 /* ClList(i)= imini ; */
00419                 ClList[Pat] = imini;
00420             }
00421         }
00422     }
00423 
00424     /* compute the ,,variances'' of the clusters */
00425     for (i=0; i<k; i++)
00426     {
00427         float64_t rmin1=0;
00428         float64_t rmin2=0;
00429 
00430         bool first_round=true;
00431 
00432         for (int32_t j=0; j<k; j++)
00433         {
00434             if (j!=i)
00435             {
00436                 int32_t l;
00437                 float64_t dist = 0;
00438 
00439                 for (l=0; l<dimensions; l++)
00440                     dist+=CMath::sq(mus[i*dimensions+l]-mus[j*dimensions+l]);
00441 
00442                 if (first_round)
00443                 {
00444                     rmin1=dist;
00445                     rmin2=dist;
00446                     first_round=false;
00447                 }
00448                 else
00449                 {
00450                     if ((dist<rmin2) && (dist>=rmin1))
00451                         rmin2=dist;
00452 
00453                     if (dist<rmin1) 
00454                     {
00455                         rmin2=rmin1;
00456                         rmin1=dist;
00457                     }
00458                 }
00459             }
00460         }
00461 
00462         R[i]=(0.7*sqrt(rmin1)+0.3*sqrt(rmin2));
00463     }
00464         distance->replace_rhs(rhs_cache);
00465         delete rhs_mus;        
00466 
00467     free(ClList);
00468     free(weights_set);
00469     free(dists);
00470     SG_UNREF(lhs);
00471 } 
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation