SVMSGD.cpp

/*
   SVM with stochastic gradient
   Copyright (C) 2007- Leon Bottou

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA
   $Id: svmsgd.cpp,v 1.13 2007/10/02 20:40:06 cvs Exp $

   Shogun adjustments (w) 2008 Soeren Sonnenburg
*/

#include "classifier/svm/SVMSGD.h"

// Available losses
#define HINGELOSS 1
#define SMOOTHHINGELOSS 2
#define SQUAREDHINGELOSS 3
#define LOGLOSS 10
#define LOGLOSSMARGIN 11

// Select loss
#define LOSS HINGELOSS

// One when bias is regularized
#define REGULARIZEBIAS 0

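// For reference (these follow directly from loss() below), with the
// margin z = y*(w.x + bias):
//   HINGELOSS        : max(0, 1-z)
//   SMOOTHHINGELOSS  : quadratically smoothed hinge
//   SQUAREDHINGELOSS : 0.5*max(0, 1-z)^2
//   LOGLOSS          : log(1+exp(-z))
//   LOGLOSSMARGIN    : log(1+exp(1-z))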
inline
float64_t loss(float64_t z)
{
#if LOSS == LOGLOSS
    if (z >= 0)
        return log(1+exp(-z));
    else
        return -z + log(1+exp(z));
#elif LOSS == LOGLOSSMARGIN
    if (z >= 1)
        return log(1+exp(1-z));
    else
        return 1-z + log(1+exp(z-1));
#elif LOSS == SMOOTHHINGELOSS
    if (z < 0)
        return 0.5 - z;
    if (z < 1)
        return 0.5 * (1-z) * (1-z);
    return 0;
#elif LOSS == SQUAREDHINGELOSS
    if (z < 1)
        return 0.5 * (1 - z) * (1 - z);
    return 0;
#elif LOSS == HINGELOSS
    if (z < 1)
        return 1 - z;
    return 0;
#else
# error "Undefined loss"
#endif
}

inline
float64_t dloss(float64_t z)
{
#if LOSS == LOGLOSS
    if (z < 0)
        return 1 / (exp(z) + 1);
    float64_t ez = exp(-z);
    return ez / (ez + 1);
#elif LOSS == LOGLOSSMARGIN
    if (z < 1)
        return 1 / (exp(z-1) + 1);
    float64_t ez = exp(1-z);
    return ez / (ez + 1);
#elif LOSS == SMOOTHHINGELOSS
    if (z < 0)
        return 1;
    if (z < 1)
        return 1-z;
    return 0;
#elif LOSS == SQUAREDHINGELOSS
    if (z < 1)
        return (1 - z);
    return 0;
#else
    if (z < 1)
        return 1;
    return 0;
#endif
}
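
/* Reading aid (derived from the functions above): both take the margin
   z = y*(w.x + bias), and dloss(z) is minus the derivative of loss(z)
   with respect to z, hence non-negative.  The SGD step in CSVMSGD::train()
   below therefore adds a non-negative multiple of y*x to w whenever it
   fires. */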

CSVMSGD::CSVMSGD(float64_t C)
: CLinearClassifier(), t(1), C1(C), C2(C),
    wscale(1), bscale(1), epochs(5), skip(1000), count(1000), use_bias(true),
    use_regularized_bias(false)
{
}

CSVMSGD::CSVMSGD(float64_t C, CDotFeatures* traindat, CLabels* trainlab)
: CLinearClassifier(), t(1), C1(C), C2(C), wscale(1), bscale(1),
    epochs(5), skip(1000), count(1000), use_bias(true),
    use_regularized_bias(false)
{
    w=NULL;
    set_features(traindat);
    set_labels(trainlab);
}

CSVMSGD::~CSVMSGD()
{
    delete[] w;
    w=NULL;
}

bool CSVMSGD::train()
{
    // allocate memory for w and initialize w and bias with 0
    ASSERT(labels);
    ASSERT(features);
    ASSERT(labels->is_two_class_labeling());

    int32_t num_train_labels=labels->get_num_labels();
    w_dim=features->get_dim_feature_space();
    int32_t num_vec=features->get_num_vectors();

    ASSERT(num_vec==num_train_labels);
    ASSERT(num_vec>0);

    delete[] w;
    w=new float64_t[w_dim];
    memset(w, 0, w_dim*sizeof(float64_t));
    bias=0;

    float64_t lambda= 1.0/(C1*num_vec);

    // Shift t in order to have a
    // reasonable initial learning rate.
    // This assumes |x| \approx 1.
    float64_t maxw = 1.0 / sqrt(lambda);
    float64_t typw = sqrt(maxw);
    float64_t eta0 = typw / CMath::max(1.0,dloss(-typw));
    t = 1 / (eta0 * lambda);
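    // With the learning rate schedule eta = 1.0/(lambda*t) used in the epoch
    // loop below, this shift makes the very first update use step size eta0;
    // eta then decays like 1/t as t is incremented once per example.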

    SG_INFO("lambda=%f, epochs=%d, eta0=%f\n", lambda, epochs, eta0);

    // estimate sparsity and bscale, then do the stochastic gradient descent
    calibrate();

    SG_INFO("Training on %d vectors\n", num_vec);
    for(int32_t e=0; e<epochs; e++)
    {
        count = skip;
        for (int32_t i=0; i<num_vec; i++)
        {
            float64_t eta = 1.0 / (lambda * t);
            float64_t y = labels->get_label(i);
            float64_t z = y * (features->dense_dot(i, w, w_dim) + bias);

#if LOSS < LOGLOSS
            if (z < 1)
#endif
            {
                float64_t etd = eta * dloss(z);
                features->add_to_dense_vec(etd * y / wscale, i, w, w_dim);

                if (use_bias)
                {
                    if (use_regularized_bias)
                        bias *= 1 - eta * lambda * bscale;
                    bias += etd * y * bscale;
                }
            }
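            // The '#if LOSS < LOGLOSS' guard above exploits that the hinge-type
            // losses have ids below 10 and dloss(z) == 0 for z >= 1, so their
            // update can be skipped entirely; the log losses always update.
            // The step adds eta*dloss(z)*y*x_i to w (wscale stays 1 throughout
            // this file) and, if enabled, a bscale-scaled term to the bias.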

            if (--count <= 0)
            {
                float64_t r = 1 - eta * lambda * skip;
                if (r < 0.8)
                    r = pow(1 - eta * lambda, skip);
                CMath::scale_vector(r, w, w_dim);
                count = skip;
            }
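            // The block above applies the l2 regularization lazily: instead of
            // shrinking w by (1 - eta*lambda) after every example, the shrink is
            // applied once every 'skip' examples with factor 1 - eta*lambda*skip,
            // a first-order approximation of (1 - eta*lambda)^skip; when that
            // approximation gets too coarse (r < 0.8) the exact power is used.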
            t++;
        }
    }

    float64_t wnorm = CMath::dot(w,w, w_dim); // squared norm of w
    SG_INFO("Norm: %.6f, Bias: %.6f\n", wnorm, bias);

    return true;
}

void CSVMSGD::calibrate()
{
    ASSERT(features);
    int32_t num_vec=features->get_num_vectors();
    int32_t c_dim=features->get_dim_feature_space();

    ASSERT(num_vec>0);
    ASSERT(c_dim>0);

    float64_t* c=new float64_t[c_dim];
    memset(c, 0, c_dim*sizeof(float64_t));

    SG_INFO("Estimating sparsity and bscale num_vec=%d num_feat=%d.\n", num_vec, c_dim);

    // compute average gradient size
    int32_t n = 0;
    float64_t m = 0;
    float64_t r = 0;

    for (int32_t j=0; j<num_vec && m<=1000; j++, n++)
    {
        r += features->get_nnz_features_for_vector(j);
        features->add_to_dense_vec(1, j, c, c_dim, true);

        // waste cpu cycles for readability
        // (only changed dims need checking)
        m=CMath::max(c, c_dim);
    }

    // bias update scaling
    bscale = m/n;

    // compute weight decay skip
    skip = (int32_t) ((16 * n * c_dim) / r);
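    // Here n is the number of examples inspected, r the total number of
    // non-zero features among them and m the largest per-dimension sum
    // accumulated in c (presumably of absolute feature values, given the
    // trailing 'true' flag).  bscale = m/n keeps bias updates on the scale
    // of typical feature values; skip = 16*n*c_dim/r amounts to one dense
    // O(c_dim) rescale of w per roughly 16*c_dim sparse feature updates.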
    SG_INFO("using %d examples. skip=%d  bscale=%.6f\n", n, skip, bscale);

    delete[] c;
}
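
/* Minimal usage sketch (illustrative only, not part of the original file):
   constructing concrete CDotFeatures and CLabels instances depends on the
   rest of the Shogun API and is left out; only the constructor and train()
   shown above are taken from this file.

     CDotFeatures* feats = ...;        // training vectors, one per example
     CLabels* labs = ...;              // matching +1/-1 labels
     CSVMSGD svm(1.0, feats, labs);    // regularization constant C = 1.0
     svm.train();                      // runs 'epochs' SGD passes over the data
*/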
