#include "lib/config.h"

#include "lib/common.h"
#include "lib/io.h"
#include "lib/File.h"
#include "lib/Time.h"
#include "base/Parallel.h"

#include "kernel/Kernel.h"
#include "kernel/IdentityKernelNormalizer.h"
#include "features/Features.h"

#include "classifier/svm/SVM.h"

#include <string.h>
#include <unistd.h>
#include <math.h>

#ifndef WIN32
#include <pthread.h>
#endif

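// Construct a kernel with the given cache size (clamped to at least 10) and
// install an identity normalizer as the default.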
CKernel::CKernel(int32_t size)
: CSGObject(), kernel_matrix(NULL), lhs(NULL),
	rhs(NULL), combined_kernel_weight(1), optimization_initialized(false),
	opt_type(FASTBUTMEMHUNGRY), properties(KP_NONE), normalizer(NULL)
{
	if (size<10)
		size=10;

	cache_size=size;

	if (get_is_initialized())
		SG_ERROR("Kernel initialized on construction.\n");

	set_normalizer(new CIdentityKernelNormalizer());
}

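// Construct a kernel with the given cache size and immediately attach the
// left-hand and right-hand side features via init().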
CKernel::CKernel(CFeatures* p_lhs, CFeatures* p_rhs, int32_t size)
: CSGObject(), kernel_matrix(NULL), lhs(NULL), rhs(NULL),
	combined_kernel_weight(1), optimization_initialized(false),
	opt_type(FASTBUTMEMHUNGRY), properties(KP_NONE), normalizer(NULL)
{
	if (size<10)
		size=10;

	cache_size=size;

	if (get_is_initialized())
		SG_ERROR("Kernel initialized on construction.\n");

	set_normalizer(new CIdentityKernelNormalizer());
	init(p_lhs, p_rhs);
}

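// Destructor: complains if an optimization is still initialized, then drops
// the feature objects and the normalizer.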
CKernel::~CKernel()
{
	if (get_is_initialized())
		SG_ERROR("Kernel still initialized on destruction.\n");

	remove_lhs_and_rhs();
	SG_UNREF(normalizer);

	SG_INFO("Kernel deleted (%p).\n", this);
}

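// Compute the full kernel matrix into a newly malloc'd array returned via
// *dst (dimensions in *m and *n, with the lhs index running fastest, i.e.
// column-major). When lhs==rhs only the upper triangle is computed and
// mirrored into the lower one.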
void CKernel::get_kernel_matrix(float64_t** dst, int32_t* m, int32_t* n)
{
	ASSERT(dst && m && n);

	float64_t* result = NULL;

	if (has_features())
	{
		int32_t num_vec1=get_num_vec_lhs();
		int32_t num_vec2=get_num_vec_rhs();
		*m=num_vec1;
		*n=num_vec2;

		int64_t total_num = ((int64_t) num_vec1) * num_vec2;
		int32_t num_done = 0;
		SG_DEBUG("returning kernel matrix of size %dx%d\n", num_vec1, num_vec2);

		result=(float64_t*) malloc(sizeof(float64_t)*total_num);
		ASSERT(result);

		if (lhs && lhs==rhs && num_vec1==num_vec2)
		{
			for (int32_t i=0; i<num_vec1; i++)
			{
				for (int32_t j=i; j<num_vec1; j++)
				{
					float64_t v=kernel(i,j);

					result[i+j*num_vec1]=v;
					result[j+i*num_vec1]=v;

					// report progress only every 100000 computed entries
					if (!(num_done%100000))
						SG_PROGRESS(num_done, 0, total_num-1);

					if (i!=j)
						num_done+=2;
					else
						num_done+=1;
				}
			}
		}
		else
		{
			for (int32_t i=0; i<num_vec1; i++)
			{
				for (int32_t j=0; j<num_vec2; j++)
				{
					result[i+j*num_vec1]=kernel(i,j);

					if (!(num_done%100000))
						SG_PROGRESS(num_done, 0, total_num-1);

					num_done++;
				}
			}
		}

		SG_DONE();
	}
	else
		SG_ERROR("no features assigned to kernel\n");

	*dst=result;
}

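// Single-precision variant: if 'target' is non-NULL its dimensions must match
// the kernel and it is filled in place; otherwise a new float32_t array of
// num_vec1 x num_vec2 entries is allocated with new[].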
float32_t* CKernel::get_kernel_matrix_shortreal(
	int32_t &num_vec1, int32_t &num_vec2, float32_t* target)
{
	float32_t* result = NULL;

	if (has_features())
	{
		if (target && (num_vec1!=get_num_vec_lhs() ||
					num_vec2!=get_num_vec_rhs()))
			SG_ERROR("kernel matrix size mismatch\n");

		num_vec1=get_num_vec_lhs();
		num_vec2=get_num_vec_rhs();

		int64_t total_num = ((int64_t) num_vec1) * num_vec2;
		int32_t num_done = 0;

		SG_DEBUG("returning kernel matrix of size %dx%d\n", num_vec1, num_vec2);

		if (target)
			result=target;
		else
			result=new float32_t[total_num];

		if (lhs && lhs==rhs && num_vec1==num_vec2)
		{
			for (int32_t i=0; i<num_vec1; i++)
			{
				for (int32_t j=i; j<num_vec1; j++)
				{
					float64_t v=kernel(i,j);

					result[i+j*num_vec1]=v;
					result[j+i*num_vec1]=v;

					if (!(num_done%100000))
						SG_PROGRESS(num_done, 0, total_num-1);

					if (i!=j)
						num_done+=2;
					else
						num_done+=1;
				}
			}
		}
		else
		{
			for (int32_t i=0; i<num_vec1; i++)
			{
				for (int32_t j=0; j<num_vec2; j++)
				{
					result[i+j*num_vec1]=kernel(i,j);

					if (!(num_done%100000))
						SG_PROGRESS(num_done, 0, total_num-1);

					num_done++;
				}
			}
		}

		SG_DONE();
	}
	else
		SG_ERROR("no features assigned to kernel\n");

	return result;
}

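// Same as above, but in double precision.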
float64_t* CKernel::get_kernel_matrix_real(
	int32_t &num_vec1, int32_t &num_vec2, float64_t* target)
{
	float64_t* result = NULL;

	if (has_features())
	{
		if (target && (num_vec1!=get_num_vec_lhs() ||
					num_vec2!=get_num_vec_rhs()))
			SG_ERROR("kernel matrix size mismatch\n");

		num_vec1=get_num_vec_lhs();
		num_vec2=get_num_vec_rhs();

		int64_t total_num = ((int64_t) num_vec1) * num_vec2;
		int32_t num_done = 0;

		SG_DEBUG("returning kernel matrix of size %dx%d\n", num_vec1, num_vec2);

		if (target)
			result=target;
		else
			result=new float64_t[total_num];

		if (lhs && lhs==rhs && num_vec1==num_vec2)
		{
			for (int32_t i=0; i<num_vec1; i++)
			{
				for (int32_t j=i; j<num_vec1; j++)
				{
					float64_t v=kernel(i,j);

					result[i+j*num_vec1]=v;
					result[j+i*num_vec1]=v;

					if (!(num_done%100000))
						SG_PROGRESS(num_done, 0, total_num-1);

					if (i!=j)
						num_done+=2;
					else
						num_done+=1;
				}
			}
		}
		else
		{
			for (int32_t i=0; i<num_vec1; i++)
			{
				for (int32_t j=0; j<num_vec2; j++)
				{
					result[i+j*num_vec1]=kernel(i,j);

					if (!(num_done%100000))
						SG_PROGRESS(num_done, 0, total_num-1);

					num_done++;
				}
			}
		}

		SG_DONE();
	}
	else
		SG_ERROR("no features assigned to kernel\n");

	return result;
}

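// Attach left-hand side and right-hand side feature objects to the kernel.
// Both sides must agree in feature class and type; previously attached
// features are released first, and references are taken on the new ones
// (only once when l and r are the same object).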
bool CKernel::init(CFeatures* l, CFeatures* r)
{
	ASSERT(l);
	ASSERT(r);

	ASSERT(l->get_feature_class()==r->get_feature_class());
	ASSERT(l->get_feature_type()==r->get_feature_type());

	remove_lhs_and_rhs();

	SG_REF(l);
	if (l!=r)
		SG_REF(r);

	lhs=l;
	rhs=r;

	return true;
}

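// Replace the kernel normalizer, referencing the new one and releasing the
// previous one.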
bool CKernel::set_normalizer(CKernelNormalizer* n)
{
	SG_REF(n);
	SG_UNREF(normalizer);
	normalizer=n;

	return (normalizer!=NULL);
}

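// Return the current normalizer, adding a reference for the caller.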
CKernelNormalizer* CKernel::get_normalizer()
{
	SG_REF(normalizer);
	return normalizer;
}

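// Give the normalizer a chance to precompute its statistics on this kernel.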
bool CKernel::init_normalizer()
{
	return normalizer->init(this);
}

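// Release the attached feature objects.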
void CKernel::cleanup()
{
	remove_lhs_and_rhs();
}

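// Loading a kernel from a file is not implemented in the base class; always
// returns false.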
bool CKernel::load(char* fname)
{
	return false;
}

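// Write the full kernel matrix to 'fname' in double precision, printing
// coarse progress while writing.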
bool CKernel::save(char* fname)
{
	int32_t i=0;
	int32_t num_left=get_num_vec_lhs();
	int32_t num_right=get_num_vec_rhs();
	KERNELCACHE_IDX num_total=((KERNELCACHE_IDX) num_left)*num_right;

	CFile f(fname, 'w', F_DREAL);

	for (int32_t l=0; l< (int32_t) num_left && f.is_ok(); l++)
	{
		for (int32_t r=0; r< (int32_t) num_right && f.is_ok(); r++)
		{
			if (!(i % (num_total/10+1)))
				SG_PRINT("%02d%%.", (int32_t) (100.0*i/num_total));
			else if (!(i % (num_total/200+1)))
				SG_PRINT(".");

			float64_t k=kernel(l,r);
			f.save_real_data(&k, 1);

			i++;
		}
	}

	if (f.is_ok())
		SG_INFO("kernel matrix of size %d x %d written (filesize: %ld)\n",
				num_left, num_right, (long) (num_total*sizeof(KERNELCACHE_ELEM)));

	return (f.is_ok());
}

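// Detach both feature objects; rhs is only unreferenced when it is a
// different object from lhs.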
void CKernel::remove_lhs_and_rhs()
{
	if (rhs!=lhs)
		SG_UNREF(rhs);
	rhs = NULL;

	SG_UNREF(lhs);
	lhs = NULL;
}

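// Detach and unreference the left-hand side features; if rhs aliased lhs it
// is only cleared, not unreferenced again.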
void CKernel::remove_lhs()
{
	if (rhs==lhs)
		rhs=NULL;
	SG_UNREF(lhs);
	lhs = NULL;
}

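// Detach the right-hand side features (only unreferenced when they differ
// from the left-hand side).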
void CKernel::remove_rhs()
{
	if (rhs!=lhs)
		SG_UNREF(rhs);
	rhs = NULL;
}

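// Print a one-line summary of this kernel: pointer, name, weight,
// optimization type, kernel type, feature class and feature type.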
void CKernel::list_kernel()
{
	SG_INFO("0x%p - \"%s\" weight=%1.2f OPT:%s", this, get_name(),
			get_combined_kernel_weight(),
			get_optimization_type()==FASTBUTMEMHUNGRY ? "FASTBUTMEMHUNGRY" :
			"SLOWBUTMEMEFFICIENT");

	switch (get_kernel_type())
	{
		case K_UNKNOWN:
			SG_INFO("K_UNKNOWN ");
			break;
		case K_LINEAR:
			SG_INFO("K_LINEAR ");
			break;
		case K_SPARSELINEAR:
			SG_INFO("K_SPARSELINEAR ");
			break;
		case K_POLY:
			SG_INFO("K_POLY ");
			break;
		case K_GAUSSIAN:
			SG_INFO("K_GAUSSIAN ");
			break;
		case K_SPARSEGAUSSIAN:
			SG_INFO("K_SPARSEGAUSSIAN ");
			break;
		case K_GAUSSIANSHIFT:
			SG_INFO("K_GAUSSIANSHIFT ");
			break;
		case K_HISTOGRAM:
			SG_INFO("K_HISTOGRAM ");
			break;
		case K_SALZBERG:
			SG_INFO("K_SALZBERG ");
			break;
		case K_LOCALITYIMPROVED:
			SG_INFO("K_LOCALITYIMPROVED ");
			break;
		case K_SIMPLELOCALITYIMPROVED:
			SG_INFO("K_SIMPLELOCALITYIMPROVED ");
			break;
		case K_FIXEDDEGREE:
			SG_INFO("K_FIXEDDEGREE ");
			break;
		case K_WEIGHTEDDEGREE:
			SG_INFO("K_WEIGHTEDDEGREE ");
			break;
		case K_WEIGHTEDDEGREEPOS:
			SG_INFO("K_WEIGHTEDDEGREEPOS ");
			break;
		case K_WEIGHTEDCOMMWORDSTRING:
			SG_INFO("K_WEIGHTEDCOMMWORDSTRING ");
			break;
		case K_POLYMATCH:
			SG_INFO("K_POLYMATCH ");
			break;
		case K_ALIGNMENT:
			SG_INFO("K_ALIGNMENT ");
			break;
		case K_COMMWORDSTRING:
			SG_INFO("K_COMMWORDSTRING ");
			break;
		case K_COMMULONGSTRING:
			SG_INFO("K_COMMULONGSTRING ");
			break;
		case K_COMBINED:
			SG_INFO("K_COMBINED ");
			break;
		case K_AUC:
			SG_INFO("K_AUC ");
			break;
		case K_CUSTOM:
			SG_INFO("K_CUSTOM ");
			break;
		case K_SIGMOID:
			SG_INFO("K_SIGMOID ");
			break;
		case K_CHI2:
			SG_INFO("K_CHI2 ");
			break;
		case K_DIAG:
			SG_INFO("K_DIAG ");
			break;
		case K_CONST:
			SG_INFO("K_CONST ");
			break;
		case K_MINDYGRAM:
			SG_INFO("K_MINDYGRAM ");
			break;
		case K_DISTANCE:
			SG_INFO("K_DISTANCE ");
			break;
		case K_LOCALALIGNMENT:
			SG_INFO("K_LOCALALIGNMENT ");
			break;
		case K_TPPK:
			SG_INFO("K_TPPK ");
			break;
		default:
			SG_ERROR("ERROR UNKNOWN KERNEL TYPE");
			break;
	}

	switch (get_feature_class())
	{
		case C_UNKNOWN:
			SG_INFO("C_UNKNOWN ");
			break;
		case C_SIMPLE:
			SG_INFO("C_SIMPLE ");
			break;
		case C_SPARSE:
			SG_INFO("C_SPARSE ");
			break;
		case C_STRING:
			SG_INFO("C_STRING ");
			break;
		case C_COMBINED:
			SG_INFO("C_COMBINED ");
			break;
		case C_ANY:
			SG_INFO("C_ANY ");
			break;
		default:
			SG_ERROR("ERROR UNKNOWN FEATURE CLASS");
	}

	switch (get_feature_type())
	{
		case F_UNKNOWN:
			SG_INFO("F_UNKNOWN ");
			break;
		case F_DREAL:
			SG_INFO("F_REAL ");
			break;
		case F_SHORT:
			SG_INFO("F_SHORT ");
			break;
		case F_CHAR:
			SG_INFO("F_CHAR ");
			break;
		case F_INT:
			SG_INFO("F_INT ");
			break;
		case F_BYTE:
			SG_INFO("F_BYTE ");
			break;
		case F_WORD:
			SG_INFO("F_WORD ");
			break;
		case F_ULONG:
			SG_INFO("F_ULONG ");
			break;
		case F_ANY:
			SG_INFO("F_ANY ");
			break;
		default:
			SG_ERROR("ERROR UNKNOWN FEATURE TYPE");
			break;
	}
	SG_INFO("\n");
}

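// Default implementations for kernels that do not support the linadd/batch
// speed-ups: they only report an error (or return trivial values) and are
// meant to be overridden by kernels implementing these optimizations.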
bool CKernel::init_optimization(
	int32_t count, int32_t *IDX, float64_t * weights)
{
	SG_ERROR("kernel does not support linadd optimization\n");
	return false;
}

bool CKernel::delete_optimization()
{
	SG_ERROR("kernel does not support linadd optimization\n");
	return false;
}

float64_t CKernel::compute_optimized(int32_t vector_idx)
{
	SG_ERROR("kernel does not support linadd optimization\n");
	return 0;
}

void CKernel::compute_batch(
	int32_t num_vec, int32_t* vec_idx, float64_t* target, int32_t num_suppvec,
	int32_t* IDX, float64_t* weights, float64_t factor)
{
	SG_ERROR("kernel does not support batch computation\n");
}

void CKernel::add_to_normal(int32_t vector_idx, float64_t weight)
{
	SG_ERROR("kernel does not support linadd optimization, add_to_normal not implemented\n");
}

void CKernel::clear_normal()
{
	SG_ERROR("kernel does not support linadd optimization, clear_normal not implemented\n");
}

int32_t CKernel::get_num_subkernels()
{
	return 1;
}

void CKernel::compute_by_subkernel(
	int32_t vector_idx, float64_t * subkernel_contrib)
{
	SG_ERROR("kernel compute_by_subkernel not implemented\n");
}

const float64_t* CKernel::get_subkernel_weights(int32_t &num_weights)
{
	num_weights=1;
	return &combined_kernel_weight;
}

void CKernel::set_subkernel_weights(float64_t* weights, int32_t num_weights)
{
	if (num_weights!=1)
		SG_ERROR("number of subkernel weights should be one ...\n");

	combined_kernel_weight = weights[0];
}

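// Convenience wrapper: collect support vector indices and alpha weights from
// a trained SVM and forward them to init_optimization().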
bool CKernel::init_optimization_svm(CSVM * svm)
{
	int32_t num_suppvec=svm->get_num_support_vectors();
	int32_t* sv_idx=new int32_t[num_suppvec];
	float64_t* sv_weight=new float64_t[num_suppvec];

	for (int32_t i=0; i<num_suppvec; i++)
	{
		sv_idx[i] = svm->get_support_vector(i);
		sv_weight[i] = svm->get_alpha(i);
	}

	bool ret = init_optimization(num_suppvec, sv_idx, sv_weight);

	delete[] sv_idx;
	delete[] sv_weight;
	return ret;
}