00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #include "features/Labels.h"
00013 #include "lib/Mathematics.h"
00014 #include "lib/DynamicArray.h"
00015 #include "lib/Time.h"
00016 #include "base/Parallel.h"
00017 #include "classifier/Classifier.h"
00018 #include "classifier/svm/libocas.h"
00019 #include "classifier/svm/WDSVMOcas.h"
00020 #include "features/StringFeatures.h"
00021 #include "features/Alphabet.h"
00022 #include "features/Labels.h"
00023
00024
00025 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00026 struct wdocas_thread_params_output
00027 {
00028 float32_t* out;
00029 int32_t* val;
00030 float64_t* output;
00031 CWDSVMOcas* wdocas;
00032 int32_t start;
00033 int32_t end;
00034 };
00035
00036 struct wdocas_thread_params_add
00037 {
00038 CWDSVMOcas* wdocas;
00039 float32_t* new_a;
00040 uint32_t* new_cut;
00041 int32_t start;
00042 int32_t end;
00043 uint32_t cut_length;
00044 };
00045 #endif // DOXYGEN_SHOULD_SKIP_THIS
00046
00047 CWDSVMOcas::CWDSVMOcas(E_SVM_TYPE type)
00048 : CClassifier(), use_bias(false), bufsize(3000), C1(1), C2(1),
00049 epsilon(1e-3), method(type)
00050 {
00051 w=NULL;
00052 old_w=NULL;
00053 degree=6;
00054 from_degree=40;
00055 wd_weights=NULL;
00056 w_offsets=NULL;
00057 normalization_const=1.0;
00058 }
00059
00060 CWDSVMOcas::CWDSVMOcas(
00061 float64_t C, int32_t d, int32_t from_d, CStringFeatures<uint8_t>* traindat,
00062 CLabels* trainlab)
00063 : CClassifier(), use_bias(false), bufsize(3000), C1(C), C2(C), epsilon(1e-3),
00064 degree(d), from_degree(from_d)
00065 {
00066 w=NULL;
00067 old_w=NULL;
00068 method=SVM_OCAS;
00069 features=traindat;
00070 set_labels(trainlab);
00071 wd_weights=NULL;
00072 w_offsets=NULL;
00073 normalization_const=1.0;
00074 }
00075
00076
00077 CWDSVMOcas::~CWDSVMOcas()
00078 {
00079 }
00080
00081 CLabels* CWDSVMOcas::classify(CLabels* output)
00082 {
00083 set_wd_weights();
00084 set_normalization_const();
00085
00086 if (features)
00087 {
00088 int32_t num=features->get_num_vectors();
00089 ASSERT(num>0);
00090
00091 if (!output)
00092 output=new CLabels(num);
00093
00094 for (int32_t i=0; i<num; i++)
00095 output->set_label(i, classify_example(i));
00096
00097 return output;
00098 }
00099
00100 return NULL;
00101 }
00102
00103 int32_t CWDSVMOcas::set_wd_weights()
00104 {
00105 ASSERT(degree>0 && degree<=8);
00106 delete[] wd_weights;
00107 wd_weights=new float32_t[degree];
00108 delete[] w_offsets;
00109 w_offsets=new int32_t[degree];
00110 int32_t w_dim_single_c=0;
00111
00112 for (int32_t i=0; i<degree; i++)
00113 {
00114 w_offsets[i]=CMath::pow(alphabet_size, i+1);
00115 wd_weights[i]=sqrt(2.0*(from_degree-i)/(from_degree*(from_degree+1)));
00116 w_dim_single_c+=w_offsets[i];
00117 }
00118 return w_dim_single_c;
00119 }
00120
00121 bool CWDSVMOcas::train()
00122 {
00123 SG_INFO("C=%f, epsilon=%f, bufsize=%d\n", get_C1(), get_epsilon(), bufsize);
00124
00125 ASSERT(labels);
00126 ASSERT(get_features());
00127 ASSERT(labels->is_two_class_labeling());
00128 CAlphabet* alphabet=get_features()->get_alphabet();
00129 ASSERT(alphabet && alphabet->get_alphabet()==RAWDNA);
00130
00131 alphabet_size=alphabet->get_num_symbols();
00132 string_length=features->get_num_vectors();
00133 int32_t num_train_labels=0;
00134 lab=labels->get_labels(num_train_labels);
00135
00136 w_dim_single_char=set_wd_weights();
00137
00138 SG_DEBUG("w_dim_single_char=%d\n", w_dim_single_char);
00139 w_dim=string_length*w_dim_single_char;
00140 SG_DEBUG("cutting plane has %d dims\n", w_dim);
00141 num_vec=get_features()->get_max_vector_length();
00142
00143 set_normalization_const();
00144 SG_INFO("num_vec: %d num_lab: %d\n", num_vec, num_train_labels);
00145 ASSERT(num_vec==num_train_labels);
00146 ASSERT(num_vec>0);
00147
00148 delete[] w;
00149 w=new float32_t[w_dim];
00150 memset(w, 0, w_dim*sizeof(float32_t));
00151
00152 delete[] old_w;
00153 old_w=new float32_t[w_dim];
00154 memset(old_w, 0, w_dim*sizeof(float32_t));
00155 bias=0;
00156 old_bias=0;
00157
00158 cuts=new float32_t*[bufsize];
00159 memset(cuts, 0, sizeof(*cuts)*bufsize);
00160 cp_bias=new float64_t[bufsize];
00161 memset(cp_bias, 0, sizeof(float64_t)*bufsize);
00162
00164
00165
00166
00167
00168
00169
00170
00171
00173 float64_t TolAbs=0;
00174 float64_t QPBound=0;
00175 uint8_t Method=0;
00176 if (method == SVM_OCAS)
00177 Method = 1;
00178 ocas_return_value_T result = svm_ocas_solver( get_C1(), num_vec, get_epsilon(),
00179 TolAbs, QPBound, bufsize, Method,
00180 &CWDSVMOcas::compute_W,
00181 &CWDSVMOcas::update_W,
00182 &CWDSVMOcas::add_new_cut,
00183 &CWDSVMOcas::compute_output,
00184 &CWDSVMOcas::sort,
00185 this);
00186
00187 SG_INFO("Ocas Converged after %d iterations\n"
00188 "==================================\n"
00189 "timing statistics:\n"
00190 "output_time: %f s\n"
00191 "sort_time: %f s\n"
00192 "add_time: %f s\n"
00193 "w_time: %f s\n"
00194 "solver_time %f s\n"
00195 "ocas_time %f s\n\n", result.nIter, result.output_time, result.sort_time,
00196 result.add_time, result.w_time, result.solver_time, result.ocas_time);
00197
00198 for (int32_t i=bufsize-1; i>=0; i--)
00199 delete[] cuts[i];
00200 delete[] cuts;
00201
00202 delete[] lab;
00203 lab=NULL;
00204
00205 SG_UNREF(alphabet);
00206
00207 return true;
00208 }
00209
00210
00211
00212
00213
00214
00215
00216
00217 float64_t CWDSVMOcas::update_W( float64_t t, void* ptr )
00218 {
00219 float64_t sq_norm_W = 0;
00220 CWDSVMOcas* o = (CWDSVMOcas*) ptr;
00221 uint32_t nDim = (uint32_t) o->w_dim;
00222 float32_t* W=o->w;
00223 float32_t* oldW=o->old_w;
00224 float64_t bias=o->bias;
00225 float64_t old_bias=bias;
00226
00227 for(uint32_t j=0; j <nDim; j++)
00228 {
00229 W[j] = oldW[j]*(1-t) + t*W[j];
00230 sq_norm_W += W[j]*W[j];
00231 }
00232
00233 bias=old_bias*(1-t) + t*bias;
00234 sq_norm_W += CMath::sq(bias);
00235
00236 o->bias=bias;
00237 o->old_bias=old_bias;
00238
00239 return( sq_norm_W );
00240 }
00241
00242
00243
00244
00245
00246
00247
00248
00249
00250 void* CWDSVMOcas::add_new_cut_helper( void* ptr)
00251 {
00252 wdocas_thread_params_add* p = (wdocas_thread_params_add*) ptr;
00253 CWDSVMOcas* o = p->wdocas;
00254 int32_t start = p->start;
00255 int32_t end = p->end;
00256 int32_t string_length = o->string_length;
00257
00258 uint32_t cut_length=p->cut_length;
00259 uint32_t* new_cut=p->new_cut;
00260 int32_t* w_offsets = o->w_offsets;
00261 float64_t* y = o->lab;
00262 int32_t alphabet_size = o->alphabet_size;
00263 float32_t* wd_weights = o->wd_weights;
00264 int32_t degree = o->degree;
00265 CStringFeatures<uint8_t>* f = o->features;
00266 float64_t normalization_const = o->normalization_const;
00267
00268
00269 float32_t* new_a = p->new_a;
00270
00271
00272
00273 int32_t* val=new int32_t[cut_length];
00274 for (int32_t j=start; j<end; j++)
00275 {
00276 int32_t offs=o->w_dim_single_char*j;
00277 memset(val,0,sizeof(int32_t)*cut_length);
00278 int32_t lim=CMath::min(degree, string_length-j);
00279 int32_t len;
00280
00281 for (int32_t k=0; k<lim; k++)
00282 {
00283 uint8_t* vec = f->get_feature_vector(j+k, len);
00284 float32_t wd = wd_weights[k]/normalization_const;
00285
00286 for(uint32_t i=0; i < cut_length; i++)
00287 {
00288 val[i]=val[i]*alphabet_size + vec[new_cut[i]];
00289 new_a[offs+val[i]]+=wd * y[new_cut[i]];
00290 }
00291 offs+=w_offsets[k];
00292 }
00293 }
00294
00295
00296 delete[] val;
00297 return NULL;
00298 }
00299
00300 void CWDSVMOcas::add_new_cut(
00301 float64_t *new_col_H, uint32_t *new_cut, uint32_t cut_length,
00302 uint32_t nSel, void* ptr)
00303 {
00304 CWDSVMOcas* o = (CWDSVMOcas*) ptr;
00305 int32_t string_length = o->string_length;
00306 uint32_t nDim=(uint32_t) o->w_dim;
00307 float32_t** cuts=o->cuts;
00308 float64_t* c_bias = o->cp_bias;
00309
00310 uint32_t i;
00311 wdocas_thread_params_add* params_add=new wdocas_thread_params_add[o->parallel->get_num_threads()];
00312 pthread_t* threads=new pthread_t[o->parallel->get_num_threads()];
00313 float32_t* new_a=new float32_t[nDim];
00314 memset(new_a, 0, sizeof(float32_t)*nDim);
00315
00316 int32_t t;
00317 int32_t nthreads=o->parallel->get_num_threads()-1;
00318 int32_t end=0;
00319 int32_t step= string_length/o->parallel->get_num_threads();
00320
00321 if (step<1)
00322 {
00323 nthreads=string_length-1;
00324 step=1;
00325 }
00326
00327 for (t=0; t<nthreads; t++)
00328 {
00329 params_add[t].wdocas=o;
00330
00331 params_add[t].new_a=new_a;
00332 params_add[t].new_cut=new_cut;
00333 params_add[t].start = step*t;
00334 params_add[t].end = step*(t+1);
00335 params_add[t].cut_length = cut_length;
00336
00337 if (pthread_create(&threads[t], NULL, &CWDSVMOcas::add_new_cut_helper, (void*)¶ms_add[t]) != 0)
00338 {
00339 nthreads=t;
00340 SG_SWARNING("thread creation failed\n");
00341 break;
00342 }
00343 end=params_add[t].end;
00344 }
00345
00346 params_add[t].wdocas=o;
00347
00348 params_add[t].new_a=new_a;
00349 params_add[t].new_cut=new_cut;
00350 params_add[t].start = step*t;
00351 params_add[t].end = string_length;
00352 params_add[t].cut_length = cut_length;
00353 add_new_cut_helper(¶ms_add[t]);
00354
00355
00356 for (t=0; t<nthreads; t++)
00357 {
00358 if (pthread_join(threads[t], NULL) != 0)
00359 SG_SWARNING( "pthread_join failed\n");
00360
00361
00362
00363
00364
00365 }
00366
00367 for(i=0; i < cut_length; i++)
00368 {
00369 if (o->use_bias)
00370 c_bias[nSel]+=o->lab[new_cut[i]];
00371 }
00372
00373
00374 for(i=0; i < nSel; i++)
00375 new_col_H[i] = CMath::dot(new_a, cuts[i], nDim) + c_bias[nSel]*c_bias[i];
00376 new_col_H[nSel] = CMath::dot(new_a, new_a, nDim) + CMath::sq(c_bias[nSel]);
00377
00378 cuts[nSel]=new_a;
00379
00380
00381
00382 delete[] threads;
00383 delete[] params_add;
00384 }
00385
00386 void CWDSVMOcas::sort( float64_t* vals, uint32_t* idx, uint32_t size)
00387 {
00388 CMath::qsort_index(vals, idx, size);
00389 }
00390
00391
00392
00393
00394
00395
00396 void* CWDSVMOcas::compute_output_helper(void* ptr)
00397 {
00398 wdocas_thread_params_output* p = (wdocas_thread_params_output*) ptr;
00399 CWDSVMOcas* o = p->wdocas;
00400 int32_t start = p->start;
00401 int32_t end = p->end;
00402 float32_t* out = p->out;
00403 float64_t* output = p->output;
00404 int32_t* val = p->val;
00405
00406 CStringFeatures<uint8_t>* f=o->get_features();
00407
00408 int32_t degree = o->degree;
00409 int32_t string_length = o->string_length;
00410 int32_t alphabet_size = o->alphabet_size;
00411 int32_t* w_offsets = o->w_offsets;
00412 float32_t* wd_weights = o->wd_weights;
00413 float32_t* w= o->w;
00414
00415 float64_t* y = o->lab;
00416 float64_t normalization_const = o->normalization_const;
00417
00418
00419 for (int32_t j=0; j<string_length; j++)
00420 {
00421 int32_t offs=o->w_dim_single_char*j;
00422 for (int32_t i=start ; i<end; i++)
00423 val[i]=0;
00424
00425 int32_t lim=CMath::min(degree, string_length-j);
00426 int32_t len;
00427
00428 for (int32_t k=0; k<lim; k++)
00429 {
00430 uint8_t* vec=f->get_feature_vector(j+k, len);
00431 float32_t wd = wd_weights[k];
00432
00433 for (int32_t i=start; i<end; i++)
00434 {
00435 val[i]=val[i]*alphabet_size + vec[i];
00436 out[i]+=wd*w[offs+val[i]];
00437 }
00438
00439
00440
00441
00442
00443
00444
00445
00446
00447
00448
00449
00450
00451
00452
00453
00454
00455
00456
00457
00458
00459
00460
00461
00462
00463
00464
00465
00466
00467
00468
00469
00470
00471
00472
00473
00474 offs+=w_offsets[k];
00475 }
00476 }
00477
00478 for (int32_t i=start; i<end; i++)
00479 output[i]=y[i]*o->bias + out[i]*y[i]/normalization_const;
00480
00481
00482
00483 return NULL;
00484 }
00485
00486 void CWDSVMOcas::compute_output( float64_t *output, void* ptr )
00487 {
00488 CWDSVMOcas* o = (CWDSVMOcas*) ptr;
00489 int32_t nData=o->num_vec;
00490 wdocas_thread_params_output* params_output=new wdocas_thread_params_output[o->parallel->get_num_threads()];
00491 pthread_t* threads = new pthread_t[o->parallel->get_num_threads()];
00492
00493 float32_t* out=new float32_t[nData];
00494 int32_t* val=new int32_t[nData];
00495 memset(out, 0, sizeof(float32_t)*nData);
00496
00497 int32_t t;
00498 int32_t nthreads=o->parallel->get_num_threads()-1;
00499 int32_t end=0;
00500 int32_t step= nData/o->parallel->get_num_threads();
00501
00502 if (step<1)
00503 {
00504 nthreads=nData-1;
00505 step=1;
00506 }
00507
00508 for (t=0; t<nthreads; t++)
00509 {
00510 params_output[t].wdocas=o;
00511 params_output[t].output=output;
00512 params_output[t].out=out;
00513 params_output[t].val=val;
00514 params_output[t].start = step*t;
00515 params_output[t].end = step*(t+1);
00516
00517
00518 if (pthread_create(&threads[t], NULL, &CWDSVMOcas::compute_output_helper, (void*)¶ms_output[t]) != 0)
00519 {
00520 nthreads=t;
00521 SG_SWARNING("thread creation failed\n");
00522 break;
00523 }
00524 end=params_output[t].end;
00525 }
00526
00527 params_output[t].wdocas=o;
00528 params_output[t].output=output;
00529 params_output[t].out=out;
00530 params_output[t].val=val;
00531 params_output[t].start = step*t;
00532 params_output[t].end = nData;
00533 compute_output_helper(¶ms_output[t]);
00534
00535
00536 for (t=0; t<nthreads; t++)
00537 {
00538 if (pthread_join(threads[t], NULL) != 0)
00539 SG_SWARNING( "pthread_join failed\n");
00540 }
00541 delete[] threads;
00542 delete[] params_output;
00543 delete[] val;
00544 delete[] out;
00545 }
00546
00547
00548
00549
00550
00551
00552
00553
00554
00555 void CWDSVMOcas::compute_W(
00556 float64_t *sq_norm_W, float64_t *dp_WoldW, float64_t *alpha, uint32_t nSel,
00557 void* ptr)
00558 {
00559 CWDSVMOcas* o = (CWDSVMOcas*) ptr;
00560 uint32_t nDim= (uint32_t) o->w_dim;
00561 CMath::swap(o->w, o->old_w);
00562 float32_t* W=o->w;
00563 float32_t* oldW=o->old_w;
00564 float32_t** cuts=o->cuts;
00565 memset(W, 0, sizeof(float32_t)*nDim);
00566 float64_t* c_bias = o->cp_bias;
00567 float64_t old_bias=o->bias;
00568 float64_t bias=0;
00569
00570 for (uint32_t i=0; i<nSel; i++)
00571 {
00572 if (alpha[i] > 0)
00573 CMath::vec1_plus_scalar_times_vec2(W, (float32_t) alpha[i], cuts[i], nDim);
00574
00575 bias += c_bias[i]*alpha[i];
00576 }
00577
00578 *sq_norm_W = CMath::dot(W,W, nDim) +CMath::sq(bias);
00579 *dp_WoldW = CMath::dot(W,oldW, nDim) + bias*old_bias;;
00580
00581
00582 o->bias = bias;
00583 o->old_bias = old_bias;
00584 }