00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #include "lib/common.h"
00013 #include "lib/io.h"
00014 #include "lib/Signal.h"
00015 #include "lib/Trie.h"
00016 #include "base/Parallel.h"
00017
00018 #include "kernel/WeightedDegreeStringKernel.h"
00019 #include "kernel/FirstElementKernelNormalizer.h"
00020 #include "features/Features.h"
00021 #include "features/StringFeatures.h"
00022
00023 #ifndef WIN32
00024 #include <pthread.h>
00025 #endif
00026
00027
00028 #ifdef HAVE_BOOST_SERIALIZATION
00029 #include <boost/serialization/export.hpp>
00030 BOOST_CLASS_EXPORT(shogun::CWeightedDegreeStringKernel);
00031 #endif //HAVE_BOOST_SERIALIZATION
00032
00033
00034 using namespace shogun;
00035
00036 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00037 struct S_THREAD_PARAM
00038 {
00039
00040 int32_t* vec;
00041 float64_t* result;
00042 float64_t* weights;
00043 CWeightedDegreeStringKernel* kernel;
00044 CTrie<DNATrie>* tries;
00045 float64_t factor;
00046 int32_t j;
00047 int32_t start;
00048 int32_t end;
00049 int32_t length;
00050 int32_t* vec_idx;
00051 };
00052 #endif // DOXYGEN_SHOULD_SKIP_THIS
00053
00054
00055 CWeightedDegreeStringKernel::CWeightedDegreeStringKernel (
00056 int32_t degree_, EWDKernType type_)
00057 : CStringKernel<char>(10),weights(NULL),position_weights(NULL),
00058 weights_buffer(NULL), mkl_stepsize(1),degree(degree_), length(0),
00059 max_mismatch(0), seq_length(0), block_computation(true),
00060 num_block_weights_external(0), block_weights_external(NULL),
00061 block_weights(NULL), type(type_), which_degree(-1), tries(NULL),
00062 tree_initialized(false), alphabet(NULL)
00063 {
00064 properties |= KP_LINADD | KP_KERNCOMBINATION | KP_BATCHEVALUATION;
00065 lhs=NULL;
00066 rhs=NULL;
00067
00068 if (type!=E_EXTERNAL)
00069 set_wd_weights_by_type(type);
00070
00071 set_normalizer(new CFirstElementKernelNormalizer());
00072 }
00073
00074 CWeightedDegreeStringKernel::CWeightedDegreeStringKernel (
00075 float64_t *weights_, int32_t degree_)
00076 : CStringKernel<char>(10), weights(NULL), position_weights(NULL),
00077 weights_buffer(NULL), mkl_stepsize(1), degree(degree_), length(0),
00078 max_mismatch(0), seq_length(0), block_computation(true),
00079 num_block_weights_external(0), block_weights_external(NULL),
00080 block_weights(NULL), type(E_EXTERNAL), which_degree(-1), tries(NULL),
00081 tree_initialized(false), alphabet(NULL)
00082 {
00083 properties |= KP_LINADD | KP_KERNCOMBINATION | KP_BATCHEVALUATION;
00084 lhs=NULL;
00085 rhs=NULL;
00086
00087 weights=new float64_t[degree*(1+max_mismatch)];
00088 for (int32_t i=0; i<degree*(1+max_mismatch); i++)
00089 weights[i]=weights_[i];
00090 set_normalizer(new CFirstElementKernelNormalizer());
00091 }
00092
00093 CWeightedDegreeStringKernel::CWeightedDegreeStringKernel(
00094 CStringFeatures<char>* l, CStringFeatures<char>* r, int32_t degree_)
00095 : CStringKernel<char>(10), weights(NULL), position_weights(NULL),
00096 weights_buffer(NULL), mkl_stepsize(1), degree(degree_), length(0),
00097 max_mismatch(0), seq_length(0), block_computation(true),
00098 num_block_weights_external(0), block_weights_external(NULL),
00099 block_weights(NULL), type(E_WD), which_degree(-1), tries(NULL),
00100 tree_initialized(false), alphabet(NULL)
00101 {
00102 properties |= KP_LINADD | KP_KERNCOMBINATION | KP_BATCHEVALUATION;
00103
00104 set_wd_weights_by_type(type);
00105 set_normalizer(new CFirstElementKernelNormalizer());
00106 init(l, r);
00107 }
00108
00109 CWeightedDegreeStringKernel::~CWeightedDegreeStringKernel()
00110 {
00111 cleanup();
00112
00113 delete[] weights;
00114 weights=NULL;
00115
00116 delete[] block_weights;
00117 block_weights=NULL;
00118
00119 delete[] position_weights;
00120 position_weights=NULL;
00121
00122 delete[] weights_buffer;
00123 weights_buffer=NULL;
00124 }
00125
00126
00127 void CWeightedDegreeStringKernel::remove_lhs()
00128 {
00129 SG_DEBUG( "deleting CWeightedDegreeStringKernel optimization\n");
00130 delete_optimization();
00131
00132 if (tries!=NULL)
00133 tries->destroy();
00134
00135 CKernel::remove_lhs();
00136 }
00137
00138 void CWeightedDegreeStringKernel::create_empty_tries()
00139 {
00140 ASSERT(lhs);
00141
00142 seq_length=((CStringFeatures<char>*) lhs)->get_max_vector_length();
00143
00144 if (tries!=NULL)
00145 {
00146 tries->destroy() ;
00147 tries->create(seq_length, max_mismatch==0) ;
00148 }
00149 }
00150
00151 bool CWeightedDegreeStringKernel::init(CFeatures* l, CFeatures* r)
00152 {
00153 int32_t lhs_changed=(lhs!=l);
00154 int32_t rhs_changed=(rhs!=r);
00155
00156 CStringKernel<char>::init(l,r);
00157
00158 SG_DEBUG("lhs_changed: %i\n", lhs_changed);
00159 SG_DEBUG("rhs_changed: %i\n", rhs_changed);
00160
00161 CStringFeatures<char>* sf_l=(CStringFeatures<char>*) l;
00162 CStringFeatures<char>* sf_r=(CStringFeatures<char>*) r;
00163
00164 int32_t len=sf_l->get_max_vector_length();
00165 if (lhs_changed && !sf_l->have_same_length(len))
00166 SG_ERROR("All strings in WD kernel must have same length (lhs wrong)!\n");
00167
00168 if (rhs_changed && !sf_r->have_same_length(len))
00169 SG_ERROR("All strings in WD kernel must have same length (rhs wrong)!\n");
00170
00171 SG_UNREF(alphabet);
00172 alphabet=sf_l->get_alphabet();
00173 CAlphabet* ralphabet=sf_r->get_alphabet();
00174
00175 if (!((alphabet->get_alphabet()==DNA) || (alphabet->get_alphabet()==RNA)))
00176 properties &= ((uint64_t) (-1)) ^ (KP_LINADD | KP_BATCHEVALUATION);
00177
00178 ASSERT(ralphabet->get_alphabet()==alphabet->get_alphabet());
00179 SG_UNREF(ralphabet);
00180
00181 if (tries!=NULL) {
00182 tries->delete_trees(max_mismatch==0);
00183 SG_UNREF(tries);
00184 }
00185 tries=new CTrie<DNATrie>(degree, max_mismatch==0);
00186 create_empty_tries();
00187
00188 init_block_weights();
00189
00190 return init_normalizer();
00191 }
00192
00193 void CWeightedDegreeStringKernel::cleanup()
00194 {
00195 SG_DEBUG("deleting CWeightedDegreeStringKernel optimization\n");
00196 delete_optimization();
00197
00198 delete[] block_weights;
00199 block_weights=NULL;
00200
00201 if (tries!=NULL)
00202 {
00203 tries->destroy();
00204 SG_UNREF(tries);
00205 tries=NULL;
00206 }
00207
00208 seq_length=0;
00209 tree_initialized = false;
00210
00211 SG_UNREF(alphabet);
00212 alphabet=NULL;
00213
00214 CKernel::cleanup();
00215 }
00216
00217 bool CWeightedDegreeStringKernel::init_optimization(int32_t count, int32_t* IDX, float64_t* alphas, int32_t tree_num)
00218 {
00219 if (tree_num<0)
00220 SG_DEBUG( "deleting CWeightedDegreeStringKernel optimization\n");
00221
00222 delete_optimization();
00223
00224 if (tree_num<0)
00225 SG_DEBUG( "initializing CWeightedDegreeStringKernel optimization\n") ;
00226
00227 for (int32_t i=0; i<count; i++)
00228 {
00229 if (tree_num<0)
00230 {
00231 if ( (i % (count/10+1)) == 0)
00232 SG_PROGRESS(i, 0, count);
00233
00234 if (max_mismatch==0)
00235 add_example_to_tree(IDX[i], alphas[i]) ;
00236 else
00237 add_example_to_tree_mismatch(IDX[i], alphas[i]) ;
00238
00239
00240 }
00241 else
00242 {
00243 if (max_mismatch==0)
00244 add_example_to_single_tree(IDX[i], alphas[i], tree_num) ;
00245 else
00246 add_example_to_single_tree_mismatch(IDX[i], alphas[i], tree_num) ;
00247 }
00248 }
00249
00250 if (tree_num<0)
00251 SG_DONE();
00252
00253
00254
00255 set_is_initialized(true) ;
00256 return true ;
00257 }
00258
00259 bool CWeightedDegreeStringKernel::delete_optimization()
00260 {
00261 if (get_is_initialized())
00262 {
00263 if (tries!=NULL)
00264 tries->delete_trees(max_mismatch==0);
00265 set_is_initialized(false);
00266 return true;
00267 }
00268
00269 return false;
00270 }
00271
00272
00273 float64_t CWeightedDegreeStringKernel::compute_with_mismatch(
00274 char* avec, int32_t alen, char* bvec, int32_t blen)
00275 {
00276 float64_t sum = 0.0;
00277
00278 for (int32_t i=0; i<alen; i++)
00279 {
00280 float64_t sumi = 0.0;
00281 int32_t mismatches=0;
00282
00283 for (int32_t j=0; (i+j<alen) && (j<degree); j++)
00284 {
00285 if (avec[i+j]!=bvec[i+j])
00286 {
00287 mismatches++ ;
00288 if (mismatches>max_mismatch)
00289 break ;
00290 } ;
00291 sumi += weights[j+degree*mismatches];
00292 }
00293 if (position_weights!=NULL)
00294 sum+=position_weights[i]*sumi ;
00295 else
00296 sum+=sumi ;
00297 }
00298 return sum ;
00299 }
00300
00301 float64_t CWeightedDegreeStringKernel::compute_using_block(
00302 char* avec, int32_t alen, char* bvec, int32_t blen)
00303 {
00304 ASSERT(alen==blen);
00305
00306 float64_t sum=0;
00307 int32_t match_len=-1;
00308
00309 for (int32_t i=0; i<alen; i++)
00310 {
00311 if (avec[i]==bvec[i])
00312 match_len++;
00313 else
00314 {
00315 if (match_len>=0)
00316 sum+=block_weights[match_len];
00317 match_len=-1;
00318 }
00319 }
00320
00321 if (match_len>=0)
00322 sum+=block_weights[match_len];
00323
00324 return sum;
00325 }
00326
00327 float64_t CWeightedDegreeStringKernel::compute_without_mismatch(
00328 char* avec, int32_t alen, char* bvec, int32_t blen)
00329 {
00330 float64_t sum = 0.0;
00331
00332 for (int32_t i=0; i<alen; i++)
00333 {
00334 float64_t sumi = 0.0;
00335
00336 for (int32_t j=0; (i+j<alen) && (j<degree); j++)
00337 {
00338 if (avec[i+j]!=bvec[i+j])
00339 break ;
00340 sumi += weights[j];
00341 }
00342 if (position_weights!=NULL)
00343 sum+=position_weights[i]*sumi ;
00344 else
00345 sum+=sumi ;
00346 }
00347 return sum ;
00348 }
00349
00350 float64_t CWeightedDegreeStringKernel::compute_without_mismatch_matrix(
00351 char* avec, int32_t alen, char* bvec, int32_t blen)
00352 {
00353 float64_t sum = 0.0;
00354
00355 for (int32_t i=0; i<alen; i++)
00356 {
00357 float64_t sumi=0.0;
00358 for (int32_t j=0; (i+j<alen) && (j<degree); j++)
00359 {
00360 if (avec[i+j]!=bvec[i+j])
00361 break;
00362 sumi += weights[i*degree+j];
00363 }
00364 if (position_weights!=NULL)
00365 sum += position_weights[i]*sumi ;
00366 else
00367 sum += sumi ;
00368 }
00369
00370 return sum ;
00371 }
00372
00373
00374 float64_t CWeightedDegreeStringKernel::compute(int32_t idx_a, int32_t idx_b)
00375 {
00376 int32_t alen, blen;
00377 bool free_avec, free_bvec;
00378 char* avec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx_a, alen, free_avec);
00379 char* bvec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx_b, blen, free_bvec);
00380 float64_t result=0;
00381
00382 if (max_mismatch==0 && length==0 && block_computation)
00383 result=compute_using_block(avec, alen, bvec, blen);
00384 else
00385 {
00386 if (max_mismatch>0)
00387 result=compute_with_mismatch(avec, alen, bvec, blen);
00388 else if (length==0)
00389 result=compute_without_mismatch(avec, alen, bvec, blen);
00390 else
00391 result=compute_without_mismatch_matrix(avec, alen, bvec, blen);
00392 }
00393 ((CStringFeatures<char>*) lhs)->free_feature_vector(avec, idx_a, free_avec);
00394 ((CStringFeatures<char>*) rhs)->free_feature_vector(bvec, idx_b, free_bvec);
00395
00396 return result;
00397 }
00398
00399
00400 void CWeightedDegreeStringKernel::add_example_to_tree(
00401 int32_t idx, float64_t alpha)
00402 {
00403 ASSERT(alphabet);
00404 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00405
00406 int32_t len=0;
00407 bool free_vec;
00408 char* char_vec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx, len, free_vec);
00409 ASSERT(max_mismatch==0);
00410 int32_t *vec=new int32_t[len];
00411
00412 for (int32_t i=0; i<len; i++)
00413 vec[i]=alphabet->remap_to_bin(char_vec[i]);
00414 ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec);
00415
00416 if (length == 0 || max_mismatch > 0)
00417 {
00418 for (int32_t i=0; i<len; i++)
00419 {
00420 float64_t alpha_pw=alpha;
00421
00422
00423 if (alpha_pw==0.0)
00424 continue;
00425 ASSERT(tries);
00426 tries->add_to_trie(i, 0, vec, normalizer->normalize_lhs(alpha_pw, idx), weights, (length!=0));
00427 }
00428 }
00429 else
00430 {
00431 for (int32_t i=0; i<len; i++)
00432 {
00433 float64_t alpha_pw=alpha;
00434
00435
00436 if (alpha_pw==0.0)
00437 continue ;
00438 ASSERT(tries);
00439 tries->add_to_trie(i, 0, vec, normalizer->normalize_lhs(alpha_pw, idx), weights, (length!=0));
00440 }
00441 }
00442 delete[] vec ;
00443 tree_initialized=true ;
00444 }
00445
00446 void CWeightedDegreeStringKernel::add_example_to_single_tree(
00447 int32_t idx, float64_t alpha, int32_t tree_num)
00448 {
00449 ASSERT(alphabet);
00450 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00451
00452 int32_t len;
00453 bool free_vec;
00454 char* char_vec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx, len, free_vec);
00455 ASSERT(max_mismatch==0);
00456 int32_t *vec = new int32_t[len] ;
00457
00458 for (int32_t i=tree_num; i<tree_num+degree && i<len; i++)
00459 vec[i]=alphabet->remap_to_bin(char_vec[i]);
00460 ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec);
00461
00462
00463 ASSERT(tries);
00464 if (alpha!=0.0)
00465 tries->add_to_trie(tree_num, 0, vec, normalizer->normalize_lhs(alpha, idx), weights, (length!=0));
00466
00467 delete[] vec ;
00468 tree_initialized=true ;
00469 }
00470
00471 void CWeightedDegreeStringKernel::add_example_to_tree_mismatch(int32_t idx, float64_t alpha)
00472 {
00473 ASSERT(tries);
00474 ASSERT(alphabet);
00475 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00476
00477 int32_t len ;
00478 bool free_vec;
00479 char* char_vec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx, len, free_vec);
00480
00481 int32_t *vec = new int32_t[len] ;
00482
00483 for (int32_t i=0; i<len; i++)
00484 vec[i]=alphabet->remap_to_bin(char_vec[i]);
00485 ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec);
00486
00487 for (int32_t i=0; i<len; i++)
00488 {
00489 if (alpha!=0.0)
00490 tries->add_example_to_tree_mismatch_recursion(NO_CHILD, i, normalizer->normalize_lhs(alpha, idx), &vec[i], len-i, 0, 0, max_mismatch, weights);
00491 }
00492
00493 delete[] vec ;
00494 tree_initialized=true ;
00495 }
00496
00497 void CWeightedDegreeStringKernel::add_example_to_single_tree_mismatch(
00498 int32_t idx, float64_t alpha, int32_t tree_num)
00499 {
00500 ASSERT(tries);
00501 ASSERT(alphabet);
00502 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00503
00504 int32_t len=0;
00505 bool free_vec;
00506 char* char_vec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx, len, free_vec);
00507 int32_t *vec=new int32_t[len];
00508
00509 for (int32_t i=tree_num; i<len && i<tree_num+degree; i++)
00510 vec[i]=alphabet->remap_to_bin(char_vec[i]);
00511 ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec);
00512
00513 if (alpha!=0.0)
00514 {
00515 tries->add_example_to_tree_mismatch_recursion(
00516 NO_CHILD, tree_num, normalizer->normalize_lhs(alpha, idx), &vec[tree_num], len-tree_num,
00517 0, 0, max_mismatch, weights);
00518 }
00519
00520 delete[] vec;
00521 tree_initialized=true;
00522 }
00523
00524
00525 float64_t CWeightedDegreeStringKernel::compute_by_tree(int32_t idx)
00526 {
00527 ASSERT(alphabet);
00528 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00529
00530 int32_t len=0;
00531 bool free_vec;
00532 char* char_vec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx, len, free_vec);
00533 ASSERT(char_vec && len>0);
00534 int32_t *vec=new int32_t[len];
00535
00536 for (int32_t i=0; i<len; i++)
00537 vec[i]=alphabet->remap_to_bin(char_vec[i]);
00538 ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec);
00539
00540 float64_t sum=0;
00541 ASSERT(tries);
00542 for (int32_t i=0; i<len; i++)
00543 sum+=tries->compute_by_tree_helper(vec, len, i, i, i, weights, (length!=0));
00544
00545 delete[] vec;
00546 return normalizer->normalize_rhs(sum, idx);
00547 }
00548
00549 void CWeightedDegreeStringKernel::compute_by_tree(
00550 int32_t idx, float64_t* LevelContrib)
00551 {
00552 ASSERT(alphabet);
00553 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00554
00555 int32_t len ;
00556 bool free_vec;
00557 char* char_vec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx, len, free_vec);
00558
00559 int32_t *vec = new int32_t[len] ;
00560
00561 for (int32_t i=0; i<len; i++)
00562 vec[i]=alphabet->remap_to_bin(char_vec[i]);
00563 ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec);
00564
00565 ASSERT(tries);
00566 for (int32_t i=0; i<len; i++)
00567 {
00568 tries->compute_by_tree_helper(vec, len, i, i, i, LevelContrib,
00569 normalizer->normalize_rhs(1.0, idx),
00570 mkl_stepsize, weights, (length!=0));
00571 }
00572
00573 delete[] vec ;
00574 }
00575
00576 float64_t *CWeightedDegreeStringKernel::compute_abs_weights(int32_t &len)
00577 {
00578 ASSERT(tries);
00579 return tries->compute_abs_weights(len);
00580 }
00581
00582 bool CWeightedDegreeStringKernel::set_wd_weights_by_type(EWDKernType p_type)
00583 {
00584 ASSERT(degree>0);
00585 ASSERT(p_type==E_WD);
00586
00587 delete[] weights;
00588 weights=new float64_t[degree];
00589 if (weights)
00590 {
00591 int32_t i;
00592 float64_t sum=0;
00593 for (i=0; i<degree; i++)
00594 {
00595 weights[i]=degree-i;
00596 sum+=weights[i];
00597 }
00598 for (i=0; i<degree; i++)
00599 weights[i]/=sum;
00600
00601 for (i=0; i<degree; i++)
00602 {
00603 for (int32_t j=1; j<=max_mismatch; j++)
00604 {
00605 if (j<i+1)
00606 {
00607 int32_t nk=CMath::nchoosek(i+1, j);
00608 weights[i+j*degree]=weights[i]/(nk*CMath::pow(3.0,j));
00609 }
00610 else
00611 weights[i+j*degree]= 0;
00612 }
00613 }
00614
00615 if (which_degree>=0)
00616 {
00617 ASSERT(which_degree<degree);
00618 for (i=0; i<degree; i++)
00619 {
00620 if (i!=which_degree)
00621 weights[i]=0;
00622 else
00623 weights[i]=1;
00624 }
00625 }
00626 return true;
00627 }
00628 else
00629 return false;
00630 }
00631
00632 bool CWeightedDegreeStringKernel::set_weights(
00633 float64_t* ws, int32_t d, int32_t len)
00634 {
00635 if (d!=degree || len<0)
00636 SG_ERROR("WD: Dimension mismatch (should be (seq_length | 1) x degree) got (%d x %d)\n", len, degree);
00637
00638 length=len;
00639
00640 if (length==0)
00641 length=1;
00642
00643 int32_t num_weights=degree*(length+max_mismatch);
00644 delete[] weights;
00645 weights=new float64_t[num_weights];
00646
00647 if (weights)
00648 {
00649 for (int32_t i=0; i<num_weights; i++) {
00650 if (ws[i])
00651 weights[i]=ws[i];
00652 }
00653 return true;
00654 }
00655 else
00656 return false;
00657 }
00658
00659 bool CWeightedDegreeStringKernel::set_position_weights(
00660 float64_t* pws, int32_t len)
00661 {
00662 if (len==0)
00663 {
00664 delete[] position_weights;
00665 position_weights=NULL;
00666 ASSERT(tries);
00667 tries->set_position_weights(position_weights);
00668 }
00669
00670 if (seq_length!=len)
00671 SG_ERROR("seq_length = %i, position_weights_length=%i\n", seq_length, len);
00672
00673 delete[] position_weights;
00674 position_weights=new float64_t[len];
00675 ASSERT(tries);
00676 tries->set_position_weights(position_weights);
00677
00678 if (position_weights)
00679 {
00680 for (int32_t i=0; i<len; i++)
00681 position_weights[i]=pws[i];
00682 return true;
00683 }
00684 else
00685 return false;
00686 }
00687
00688 bool CWeightedDegreeStringKernel::init_block_weights_from_wd()
00689 {
00690 delete[] block_weights;
00691 block_weights=new float64_t[CMath::max(seq_length,degree)];
00692
00693 if (block_weights)
00694 {
00695 int32_t k;
00696 float64_t d=degree;
00697
00698 for (k=0; k<degree; k++)
00699 block_weights[k]=
00700 (-CMath::pow(k, 3)+(3*d-3)*CMath::pow(k, 2)+(9*d-2)*k+6*d)/(3*d*(d+1));
00701 for (k=degree; k<seq_length; k++)
00702 block_weights[k]=(-d+3*k+4)/3;
00703 }
00704
00705 return (block_weights!=NULL);
00706 }
00707
00708 bool CWeightedDegreeStringKernel::init_block_weights_from_wd_external()
00709 {
00710 ASSERT(weights);
00711 delete[] block_weights;
00712 block_weights=new float64_t[CMath::max(seq_length,degree)];
00713
00714 if (block_weights)
00715 {
00716 int32_t i=0;
00717 block_weights[0]=weights[0];
00718 for (i=1; i<CMath::max(seq_length,degree); i++)
00719 block_weights[i]=0;
00720
00721 for (i=1; i<CMath::max(seq_length,degree); i++)
00722 {
00723 block_weights[i]=block_weights[i-1];
00724
00725 float64_t contrib=0;
00726 for (int32_t j=0; j<CMath::min(degree,i+1); j++)
00727 contrib+=weights[j];
00728
00729 block_weights[i]+=contrib;
00730 }
00731 }
00732
00733 return (block_weights!=NULL);
00734 }
00735
00736 bool CWeightedDegreeStringKernel::init_block_weights_const()
00737 {
00738 delete[] block_weights;
00739 block_weights=new float64_t[seq_length];
00740
00741 if (block_weights)
00742 {
00743 for (int32_t i=1; i<seq_length+1 ; i++)
00744 block_weights[i-1]=1.0/seq_length;
00745 }
00746
00747 return (block_weights!=NULL);
00748 }
00749
00750 bool CWeightedDegreeStringKernel::init_block_weights_linear()
00751 {
00752 delete[] block_weights;
00753 block_weights=new float64_t[seq_length];
00754
00755 if (block_weights)
00756 {
00757 for (int32_t i=1; i<seq_length+1 ; i++)
00758 block_weights[i-1]=degree*i;
00759 }
00760
00761 return (block_weights!=NULL);
00762 }
00763
00764 bool CWeightedDegreeStringKernel::init_block_weights_sqpoly()
00765 {
00766 delete[] block_weights;
00767 block_weights=new float64_t[seq_length];
00768
00769 if (block_weights)
00770 {
00771 for (int32_t i=1; i<degree+1 ; i++)
00772 block_weights[i-1]=((float64_t) i)*i;
00773
00774 for (int32_t i=degree+1; i<seq_length+1 ; i++)
00775 block_weights[i-1]=i;
00776 }
00777
00778 return (block_weights!=NULL);
00779 }
00780
00781 bool CWeightedDegreeStringKernel::init_block_weights_cubicpoly()
00782 {
00783 delete[] block_weights;
00784 block_weights=new float64_t[seq_length];
00785
00786 if (block_weights)
00787 {
00788 for (int32_t i=1; i<degree+1 ; i++)
00789 block_weights[i-1]=((float64_t) i)*i*i;
00790
00791 for (int32_t i=degree+1; i<seq_length+1 ; i++)
00792 block_weights[i-1]=i;
00793 }
00794
00795 return (block_weights!=NULL);
00796 }
00797
00798 bool CWeightedDegreeStringKernel::init_block_weights_exp()
00799 {
00800 delete[] block_weights;
00801 block_weights=new float64_t[seq_length];
00802
00803 if (block_weights)
00804 {
00805 for (int32_t i=1; i<degree+1 ; i++)
00806 block_weights[i-1]=exp(((float64_t) i/10.0));
00807
00808 for (int32_t i=degree+1; i<seq_length+1 ; i++)
00809 block_weights[i-1]=i;
00810 }
00811
00812 return (block_weights!=NULL);
00813 }
00814
00815 bool CWeightedDegreeStringKernel::init_block_weights_log()
00816 {
00817 delete[] block_weights;
00818 block_weights=new float64_t[seq_length];
00819
00820 if (block_weights)
00821 {
00822 for (int32_t i=1; i<degree+1 ; i++)
00823 block_weights[i-1]=CMath::pow(CMath::log((float64_t) i),2);
00824
00825 for (int32_t i=degree+1; i<seq_length+1 ; i++)
00826 block_weights[i-1]=i-degree+1+CMath::pow(CMath::log(degree+1.0),2);
00827 }
00828
00829 return (block_weights!=NULL);
00830 }
00831
00832 bool CWeightedDegreeStringKernel::init_block_weights_external()
00833 {
00834 if (block_weights_external && (seq_length == num_block_weights_external) )
00835 {
00836 delete[] block_weights;
00837 block_weights=new float64_t[seq_length];
00838
00839 if (block_weights)
00840 {
00841 for (int32_t i=0; i<seq_length; i++)
00842 block_weights[i]=block_weights_external[i];
00843 }
00844 }
00845 else {
00846 SG_ERROR( "sequence longer then weights (seqlen:%d, wlen:%d)\n", seq_length, block_weights_external);
00847 }
00848 return (block_weights!=NULL);
00849 }
00850
00851 bool CWeightedDegreeStringKernel::init_block_weights()
00852 {
00853 switch (type)
00854 {
00855 case E_WD:
00856 return init_block_weights_from_wd();
00857 case E_EXTERNAL:
00858 return init_block_weights_from_wd_external();
00859 case E_BLOCK_CONST:
00860 return init_block_weights_const();
00861 case E_BLOCK_LINEAR:
00862 return init_block_weights_linear();
00863 case E_BLOCK_SQPOLY:
00864 return init_block_weights_sqpoly();
00865 case E_BLOCK_CUBICPOLY:
00866 return init_block_weights_cubicpoly();
00867 case E_BLOCK_EXP:
00868 return init_block_weights_exp();
00869 case E_BLOCK_LOG:
00870 return init_block_weights_log();
00871 case E_BLOCK_EXTERNAL:
00872 return init_block_weights_external();
00873 default:
00874 return false;
00875 };
00876 }
00877
00878
00879 void* CWeightedDegreeStringKernel::compute_batch_helper(void* p)
00880 {
00881 S_THREAD_PARAM* params = (S_THREAD_PARAM*) p;
00882 int32_t j=params->j;
00883 CWeightedDegreeStringKernel* wd=params->kernel;
00884 CTrie<DNATrie>* tries=params->tries;
00885 float64_t* weights=params->weights;
00886 int32_t length=params->length;
00887 int32_t* vec=params->vec;
00888 float64_t* result=params->result;
00889 float64_t factor=params->factor;
00890 int32_t* vec_idx=params->vec_idx;
00891
00892 CStringFeatures<char>* rhs_feat=((CStringFeatures<char>*) wd->get_rhs());
00893 CAlphabet* alpha=wd->alphabet;
00894
00895 for (int32_t i=params->start; i<params->end; i++)
00896 {
00897 int32_t len=0;
00898 bool free_vec;
00899 char* char_vec=rhs_feat->get_feature_vector(vec_idx[i], len, free_vec);
00900 for (int32_t k=j; k<CMath::min(len,j+wd->get_degree()); k++)
00901 vec[k]=alpha->remap_to_bin(char_vec[k]);
00902 rhs_feat->free_feature_vector(char_vec, vec_idx[i], free_vec);
00903
00904 ASSERT(tries);
00905
00906 result[i]+=factor*
00907 wd->normalizer->normalize_rhs(tries->compute_by_tree_helper(vec, len, j, j, j, weights, (length!=0)), vec_idx[i]);
00908 }
00909
00910 SG_UNREF(rhs_feat);
00911
00912 return NULL;
00913 }
00914
00915 void CWeightedDegreeStringKernel::compute_batch(
00916 int32_t num_vec, int32_t* vec_idx, float64_t* result, int32_t num_suppvec,
00917 int32_t* IDX, float64_t* alphas, float64_t factor)
00918 {
00919 ASSERT(tries);
00920 ASSERT(alphabet);
00921 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00922 ASSERT(rhs);
00923 ASSERT(num_vec<=rhs->get_num_vectors());
00924 ASSERT(num_vec>0);
00925 ASSERT(vec_idx);
00926 ASSERT(result);
00927 create_empty_tries();
00928
00929 int32_t num_feat=((CStringFeatures<char>*) rhs)->get_max_vector_length();
00930 ASSERT(num_feat>0);
00931 int32_t num_threads=parallel->get_num_threads();
00932 ASSERT(num_threads>0);
00933 int32_t* vec=new int32_t[num_threads*num_feat];
00934
00935 if (num_threads < 2)
00936 {
00937 #ifdef CYGWIN
00938 for (int32_t j=0; j<num_feat; j++)
00939 #else
00940 CSignal::clear_cancel();
00941 for (int32_t j=0; j<num_feat && !CSignal::cancel_computations(); j++)
00942 #endif
00943 {
00944 init_optimization(num_suppvec, IDX, alphas, j);
00945 S_THREAD_PARAM params;
00946 params.vec=vec;
00947 params.result=result;
00948 params.weights=weights;
00949 params.kernel=this;
00950 params.tries=tries;
00951 params.factor=factor;
00952 params.j=j;
00953 params.start=0;
00954 params.end=num_vec;
00955 params.length=length;
00956 params.vec_idx=vec_idx;
00957 compute_batch_helper((void*) ¶ms);
00958
00959 SG_PROGRESS(j,0,num_feat);
00960 }
00961 }
00962 #ifndef WIN32
00963 else
00964 {
00965 CSignal::clear_cancel();
00966 for (int32_t j=0; j<num_feat && !CSignal::cancel_computations(); j++)
00967 {
00968 init_optimization(num_suppvec, IDX, alphas, j);
00969 pthread_t* threads = new pthread_t[num_threads-1];
00970 S_THREAD_PARAM* params = new S_THREAD_PARAM[num_threads];
00971 int32_t step= num_vec/num_threads;
00972 int32_t t;
00973
00974 for (t=0; t<num_threads-1; t++)
00975 {
00976 params[t].vec=&vec[num_feat*t];
00977 params[t].result=result;
00978 params[t].weights=weights;
00979 params[t].kernel=this;
00980 params[t].tries=tries;
00981 params[t].factor=factor;
00982 params[t].j=j;
00983 params[t].start = t*step;
00984 params[t].end = (t+1)*step;
00985 params[t].length=length;
00986 params[t].vec_idx=vec_idx;
00987 pthread_create(&threads[t], NULL, CWeightedDegreeStringKernel::compute_batch_helper, (void*)¶ms[t]);
00988 }
00989 params[t].vec=&vec[num_feat*t];
00990 params[t].result=result;
00991 params[t].weights=weights;
00992 params[t].kernel=this;
00993 params[t].tries=tries;
00994 params[t].factor=factor;
00995 params[t].j=j;
00996 params[t].start=t*step;
00997 params[t].end=num_vec;
00998 params[t].length=length;
00999 params[t].vec_idx=vec_idx;
01000 compute_batch_helper((void*) ¶ms[t]);
01001
01002 for (t=0; t<num_threads-1; t++)
01003 pthread_join(threads[t], NULL);
01004 SG_PROGRESS(j,0,num_feat);
01005
01006 delete[] params;
01007 delete[] threads;
01008 }
01009 }
01010 #endif
01011
01012 delete[] vec;
01013
01014
01015
01016 create_empty_tries();
01017 }
01018
01019 bool CWeightedDegreeStringKernel::set_max_mismatch(int32_t max)
01020 {
01021 if (type==E_EXTERNAL && max!=0) {
01022 return false;
01023 }
01024
01025 max_mismatch=max;
01026
01027 if (lhs!=NULL && rhs!=NULL)
01028 return init(lhs, rhs);
01029 else
01030 return true;
01031 }