00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #include "lib/common.h"
00013 #include "lib/io.h"
00014 #include "lib/Signal.h"
00015 #include "lib/Trie.h"
00016 #include "base/Parallel.h"
00017
00018 #include "kernel/WeightedDegreePositionStringKernel.h"
00019 #include "kernel/SqrtDiagKernelNormalizer.h"
00020 #include "features/Features.h"
00021 #include "features/StringFeatures.h"
00022
00023 #include "classifier/svm/SVM.h"
00024
00025 #ifndef WIN32
00026 #include <pthread.h>
00027 #endif
00028
00029 #define TRIES(X) ((use_poim_tries) ? (poim_tries.X) : (tries.X))
00030
00031 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00032 template <class Trie> struct S_THREAD_PARAM
00033 {
00034 int32_t* vec;
00035 float64_t* result;
00036 float64_t* weights;
00037 CWeightedDegreePositionStringKernel* kernel;
00038 CTrie<Trie>* tries;
00039 float64_t factor;
00040 int32_t j;
00041 int32_t start;
00042 int32_t end;
00043 int32_t length;
00044 int32_t max_shift;
00045 int32_t* shift;
00046 int32_t* vec_idx;
00047 };
00048 #endif // DOXYGEN_SHOULD_SKIP_THIS
00049
00050 CWeightedDegreePositionStringKernel::CWeightedDegreePositionStringKernel(
00051 int32_t size, int32_t d, int32_t mm, int32_t mkls)
00052 : CStringKernel<char>(size), weights(NULL), position_weights(NULL),
00053 position_weights_lhs(NULL), position_weights_rhs(NULL),
00054 weights_buffer(NULL), mkl_stepsize(mkls), degree(d), length(0),
00055 max_mismatch(mm), seq_length(0), shift(NULL), shift_len(0),
00056 num_block_weights_external(0), block_weights_external(NULL),
00057 block_weights(NULL), type(E_EXTERNAL), tries(d), poim_tries(d),
00058 tree_initialized(false), use_poim_tries(false), m_poim_distrib(NULL),
00059 m_poim(NULL), m_poim_num_sym(0), m_poim_num_feat(0), m_poim_result_len(0),
00060 alphabet(NULL)
00061 {
00062 properties |= KP_LINADD | KP_KERNCOMBINATION | KP_BATCHEVALUATION;
00063 set_wd_weights();
00064 ASSERT(weights);
00065
00066 set_normalizer(new CSqrtDiagKernelNormalizer());
00067 }
00068
00069 CWeightedDegreePositionStringKernel::CWeightedDegreePositionStringKernel(
00070 int32_t size, float64_t* w, int32_t d, int32_t mm, int32_t* s, int32_t sl,
00071 int32_t mkls)
00072 : CStringKernel<char>(size), weights(NULL), position_weights(NULL),
00073 position_weights_lhs(NULL), position_weights_rhs(NULL),
00074 weights_buffer(NULL), mkl_stepsize(mkls), degree(d), length(0),
00075 max_mismatch(mm), seq_length(0), shift(NULL), shift_len(0),
00076 num_block_weights_external(0), block_weights_external(NULL),
00077 block_weights(NULL), type(E_EXTERNAL), tries(d), poim_tries(d),
00078 tree_initialized(false), use_poim_tries(false), m_poim_distrib(NULL),
00079 m_poim(NULL), m_poim_num_sym(0), m_poim_num_feat(0), m_poim_result_len(0),
00080 alphabet(NULL)
00081 {
00082 properties |= KP_LINADD | KP_KERNCOMBINATION | KP_BATCHEVALUATION;
00083
00084 weights=new float64_t[d*(1+max_mismatch)];
00085 for (int32_t i=0; i<d*(1+max_mismatch); i++)
00086 weights[i]=w[i];
00087
00088 set_shifts(s, sl);
00089
00090 set_normalizer(new CSqrtDiagKernelNormalizer());
00091 }
00092
00093 CWeightedDegreePositionStringKernel::CWeightedDegreePositionStringKernel(
00094 CStringFeatures<char>* l, CStringFeatures<char>* r, int32_t d)
00095 : CStringKernel<char>(10), weights(NULL), position_weights(NULL),
00096 position_weights_lhs(NULL), position_weights_rhs(NULL),
00097 weights_buffer(NULL), mkl_stepsize(1), degree(d), length(0),
00098 max_mismatch(0), seq_length(0), shift(NULL), shift_len(0),
00099 num_block_weights_external(0), block_weights_external(NULL),
00100 block_weights(NULL), type(E_EXTERNAL), tries(d), poim_tries(d),
00101 tree_initialized(false), use_poim_tries(false), m_poim_distrib(NULL),
00102 m_poim(NULL), m_poim_num_sym(0), m_poim_num_feat(0), m_poim_result_len(0),
00103 alphabet(NULL)
00104 {
00105 properties |= KP_LINADD | KP_KERNCOMBINATION | KP_BATCHEVALUATION;
00106 set_wd_weights();
00107 ASSERT(weights);
00108 set_normalizer(new CSqrtDiagKernelNormalizer());
00109
00110 init(l, r);
00111 }
00112
00113
00114 CWeightedDegreePositionStringKernel::~CWeightedDegreePositionStringKernel()
00115 {
00116 cleanup();
00117 cleanup_POIM2();
00118
00119 delete[] shift;
00120 shift=NULL;
00121
00122 delete[] weights;
00123 weights=NULL ;
00124
00125 delete[] block_weights;
00126 block_weights=NULL;
00127
00128 delete[] position_weights;
00129 position_weights=NULL;
00130
00131 delete[] position_weights_lhs;
00132 position_weights_lhs=NULL;
00133
00134 delete[] position_weights_rhs;
00135 position_weights_rhs=NULL;
00136
00137 delete[] weights_buffer;
00138 weights_buffer=NULL;
00139 }
00140
00141 void CWeightedDegreePositionStringKernel::remove_lhs()
00142 {
00143 SG_DEBUG( "deleting CWeightedDegreePositionStringKernel optimization\n");
00144 delete_optimization();
00145
00146 tries.destroy();
00147 poim_tries.destroy();
00148
00149 CKernel::remove_lhs();
00150 }
00151
00152 void CWeightedDegreePositionStringKernel::create_empty_tries()
00153 {
00154 ASSERT(lhs);
00155 seq_length = ((CStringFeatures<char>*) lhs)->get_max_vector_length();
00156
00157 if (opt_type==SLOWBUTMEMEFFICIENT)
00158 {
00159 tries.create(seq_length, true);
00160 poim_tries.create(seq_length, true);
00161 }
00162 else if (opt_type==FASTBUTMEMHUNGRY)
00163 {
00164 tries.create(seq_length, false);
00165 poim_tries.create(seq_length, false);
00166 }
00167 else
00168 SG_ERROR( "unknown optimization type\n");
00169 }
00170
00171 bool CWeightedDegreePositionStringKernel::init(CFeatures* l, CFeatures* r)
00172 {
00173 int32_t lhs_changed = (lhs!=l) ;
00174 int32_t rhs_changed = (rhs!=r) ;
00175
00176 CStringKernel<char>::init(l,r);
00177
00178 SG_DEBUG( "lhs_changed: %i\n", lhs_changed) ;
00179 SG_DEBUG( "rhs_changed: %i\n", rhs_changed) ;
00180
00181 CStringFeatures<char>* sf_l=(CStringFeatures<char>*) l;
00182 CStringFeatures<char>* sf_r=(CStringFeatures<char>*) r;
00183
00184
00185 if (shift_len==0) {
00186 shift_len=sf_l->get_vector_length(0);
00187 int32_t *shifts=new int32_t[shift_len];
00188 for (int32_t i=0; i<shift_len; i++) {
00189 shifts[i]=1;
00190 }
00191 set_shifts(shifts, shift_len);
00192 delete[] shifts;
00193 }
00194
00195
00196 int32_t len=sf_l->get_max_vector_length();
00197 if (lhs_changed && !sf_l->have_same_length(len))
00198 SG_ERROR("All strings in WD kernel must have same length (lhs wrong)!\n");
00199
00200 if (rhs_changed && !sf_r->have_same_length(len))
00201 SG_ERROR("All strings in WD kernel must have same length (rhs wrong)!\n");
00202
00203 SG_UNREF(alphabet);
00204 alphabet= sf_l->get_alphabet();
00205 CAlphabet* ralphabet=sf_r->get_alphabet();
00206
00207 if (!((alphabet->get_alphabet()==DNA) || (alphabet->get_alphabet()==RNA)))
00208 properties &= ((uint64_t) (-1)) ^ (KP_LINADD | KP_BATCHEVALUATION);
00209
00210 ASSERT(ralphabet->get_alphabet()==alphabet->get_alphabet());
00211 SG_UNREF(ralphabet);
00212
00213
00214 create_empty_tries();
00215 init_block_weights();
00216
00217 return init_normalizer();
00218 }
00219
00220 void CWeightedDegreePositionStringKernel::cleanup()
00221 {
00222 SG_DEBUG( "deleting CWeightedDegreePositionStringKernel optimization\n");
00223 delete_optimization();
00224
00225 delete[] block_weights;
00226 block_weights=NULL;
00227
00228 tries.destroy();
00229 poim_tries.destroy();
00230
00231 seq_length = 0;
00232 tree_initialized = false;
00233
00234 SG_UNREF(alphabet);
00235 alphabet=NULL;
00236
00237 CKernel::cleanup();
00238 }
00239
00240 bool CWeightedDegreePositionStringKernel::load_init(FILE* src)
00241 {
00242 return false;
00243 }
00244
00245 bool CWeightedDegreePositionStringKernel::save_init(FILE* dest)
00246 {
00247 return false;
00248 }
00249
00250 bool CWeightedDegreePositionStringKernel::init_optimization(
00251 int32_t p_count, int32_t * IDX, float64_t * alphas, int32_t tree_num,
00252 int32_t upto_tree)
00253 {
00254 ASSERT(position_weights_lhs==NULL);
00255 ASSERT(position_weights_rhs==NULL);
00256
00257 if (upto_tree<0)
00258 upto_tree=tree_num;
00259
00260 if (max_mismatch!=0)
00261 {
00262 SG_ERROR( "CWeightedDegreePositionStringKernel optimization not implemented for mismatch!=0\n");
00263 return false ;
00264 }
00265
00266 if (tree_num<0)
00267 SG_DEBUG( "deleting CWeightedDegreePositionStringKernel optimization\n");
00268
00269 delete_optimization();
00270
00271 if (tree_num<0)
00272 SG_DEBUG( "initializing CWeightedDegreePositionStringKernel optimization\n") ;
00273
00274 for (int32_t i=0; i<p_count; i++)
00275 {
00276 if (tree_num<0)
00277 {
00278 if ( (i % (p_count/10+1)) == 0)
00279 SG_PROGRESS(i,0,p_count);
00280 add_example_to_tree(IDX[i], alphas[i]);
00281 }
00282 else
00283 {
00284 for (int32_t t=tree_num; t<=upto_tree; t++)
00285 add_example_to_single_tree(IDX[i], alphas[i], t);
00286 }
00287 }
00288
00289 if (tree_num<0)
00290 SG_DONE();
00291
00292 set_is_initialized(true) ;
00293 return true ;
00294 }
00295
00296 bool CWeightedDegreePositionStringKernel::delete_optimization()
00297 {
00298 if ((opt_type==FASTBUTMEMHUNGRY) && (tries.get_use_compact_terminal_nodes()))
00299 {
00300 tries.set_use_compact_terminal_nodes(false) ;
00301 SG_DEBUG( "disabling compact trie nodes with FASTBUTMEMHUNGRY\n") ;
00302 }
00303
00304 if (get_is_initialized())
00305 {
00306 if (opt_type==SLOWBUTMEMEFFICIENT)
00307 tries.delete_trees(true);
00308 else if (opt_type==FASTBUTMEMHUNGRY)
00309 tries.delete_trees(false);
00310 else {
00311 SG_ERROR( "unknown optimization type\n");
00312 }
00313 set_is_initialized(false);
00314
00315 return true;
00316 }
00317
00318 return false;
00319 }
00320
00321 float64_t CWeightedDegreePositionStringKernel::compute_with_mismatch(
00322 char* avec, int32_t alen, char* bvec, int32_t blen)
00323 {
00324 float64_t* max_shift_vec= new float64_t[max_shift];
00325 float64_t sum0=0 ;
00326 for (int32_t i=0; i<max_shift; i++)
00327 max_shift_vec[i]=0 ;
00328
00329
00330 for (int32_t i=0; i<alen; i++)
00331 {
00332 if ((position_weights!=NULL) && (position_weights[i]==0.0))
00333 continue ;
00334
00335 int32_t mismatches=0;
00336 float64_t sumi = 0.0 ;
00337 for (int32_t j=0; (j<degree) && (i+j<alen); j++)
00338 {
00339 if (avec[i+j]!=bvec[i+j])
00340 {
00341 mismatches++ ;
00342 if (mismatches>max_mismatch)
00343 break ;
00344 } ;
00345 sumi += weights[j+degree*mismatches];
00346 }
00347 if (position_weights!=NULL)
00348 sum0 += position_weights[i]*sumi ;
00349 else
00350 sum0 += sumi ;
00351 } ;
00352
00353 for (int32_t i=0; i<alen; i++)
00354 {
00355 for (int32_t k=1; (k<=shift[i]) && (i+k<alen); k++)
00356 {
00357 if ((position_weights!=NULL) && (position_weights[i]==0.0) && (position_weights[i+k]==0.0))
00358 continue ;
00359
00360 float64_t sumi1 = 0.0 ;
00361
00362 int32_t mismatches=0;
00363 for (int32_t j=0; (j<degree) && (i+j+k<alen); j++)
00364 {
00365 if (avec[i+j+k]!=bvec[i+j])
00366 {
00367 mismatches++ ;
00368 if (mismatches>max_mismatch)
00369 break ;
00370 } ;
00371 sumi1 += weights[j+degree*mismatches];
00372 }
00373 float64_t sumi2 = 0.0 ;
00374
00375 mismatches=0;
00376 for (int32_t j=0; (j<degree) && (i+j+k<alen); j++)
00377 {
00378 if (avec[i+j]!=bvec[i+j+k])
00379 {
00380 mismatches++ ;
00381 if (mismatches>max_mismatch)
00382 break ;
00383 } ;
00384 sumi2 += weights[j+degree*mismatches];
00385 }
00386 if (position_weights!=NULL)
00387 max_shift_vec[k-1] += position_weights[i]*sumi1 + position_weights[i+k]*sumi2 ;
00388 else
00389 max_shift_vec[k-1] += sumi1 + sumi2 ;
00390 } ;
00391 }
00392
00393 float64_t result = sum0 ;
00394 for (int32_t i=0; i<max_shift; i++)
00395 result += max_shift_vec[i]/(2*(i+1)) ;
00396
00397 delete[] max_shift_vec;
00398 return result ;
00399 }
00400
00401 float64_t CWeightedDegreePositionStringKernel::compute_without_mismatch(
00402 char* avec, int32_t alen, char* bvec, int32_t blen)
00403 {
00404 float64_t* max_shift_vec = new float64_t[max_shift];
00405 float64_t sum0=0 ;
00406 for (int32_t i=0; i<max_shift; i++)
00407 max_shift_vec[i]=0 ;
00408
00409
00410 for (int32_t i=0; i<alen; i++)
00411 {
00412 if ((position_weights!=NULL) && (position_weights[i]==0.0))
00413 continue ;
00414
00415 float64_t sumi = 0.0 ;
00416 for (int32_t j=0; (j<degree) && (i+j<alen); j++)
00417 {
00418 if (avec[i+j]!=bvec[i+j])
00419 break ;
00420 sumi += weights[j];
00421 }
00422 if (position_weights!=NULL)
00423 sum0 += position_weights[i]*sumi ;
00424 else
00425 sum0 += sumi ;
00426 } ;
00427
00428 for (int32_t i=0; i<alen; i++)
00429 {
00430 for (int32_t k=1; (k<=shift[i]) && (i+k<alen); k++)
00431 {
00432 if ((position_weights!=NULL) && (position_weights[i]==0.0) && (position_weights[i+k]==0.0))
00433 continue ;
00434
00435 float64_t sumi1 = 0.0 ;
00436
00437 for (int32_t j=0; (j<degree) && (i+j+k<alen); j++)
00438 {
00439 if (avec[i+j+k]!=bvec[i+j])
00440 break ;
00441 sumi1 += weights[j];
00442 }
00443 float64_t sumi2 = 0.0 ;
00444
00445 for (int32_t j=0; (j<degree) && (i+j+k<alen); j++)
00446 {
00447 if (avec[i+j]!=bvec[i+j+k])
00448 break ;
00449 sumi2 += weights[j];
00450 }
00451 if (position_weights!=NULL)
00452 max_shift_vec[k-1] += position_weights[i]*sumi1 + position_weights[i+k]*sumi2 ;
00453 else
00454 max_shift_vec[k-1] += sumi1 + sumi2 ;
00455 } ;
00456 }
00457
00458 float64_t result = sum0 ;
00459 for (int32_t i=0; i<max_shift; i++)
00460 result += max_shift_vec[i]/(2*(i+1)) ;
00461
00462 delete[] max_shift_vec;
00463 return result ;
00464 }
00465
00466 float64_t CWeightedDegreePositionStringKernel::compute_without_mismatch_matrix(
00467 char* avec, int32_t alen, char* bvec, int32_t blen)
00468 {
00469 float64_t* max_shift_vec = new float64_t[max_shift];
00470 float64_t sum0=0 ;
00471 for (int32_t i=0; i<max_shift; i++)
00472 max_shift_vec[i]=0 ;
00473
00474
00475 for (int32_t i=0; i<alen; i++)
00476 {
00477 if ((position_weights!=NULL) && (position_weights[i]==0.0))
00478 continue ;
00479 float64_t sumi = 0.0 ;
00480 for (int32_t j=0; (j<degree) && (i+j<alen); j++)
00481 {
00482 if (avec[i+j]!=bvec[i+j])
00483 break ;
00484 sumi += weights[i*degree+j];
00485 }
00486 if (position_weights!=NULL)
00487 sum0 += position_weights[i]*sumi ;
00488 else
00489 sum0 += sumi ;
00490 } ;
00491
00492 for (int32_t i=0; i<alen; i++)
00493 {
00494 for (int32_t k=1; (k<=shift[i]) && (i+k<alen); k++)
00495 {
00496 if ((position_weights!=NULL) && (position_weights[i]==0.0) && (position_weights[i+k]==0.0))
00497 continue ;
00498
00499 float64_t sumi1 = 0.0 ;
00500
00501 for (int32_t j=0; (j<degree) && (i+j+k<alen); j++)
00502 {
00503 if (avec[i+j+k]!=bvec[i+j])
00504 break ;
00505 sumi1 += weights[i*degree+j];
00506 }
00507 float64_t sumi2 = 0.0 ;
00508
00509 for (int32_t j=0; (j<degree) && (i+j+k<alen); j++)
00510 {
00511 if (avec[i+j]!=bvec[i+j+k])
00512 break ;
00513 sumi2 += weights[i*degree+j];
00514 }
00515 if (position_weights!=NULL)
00516 max_shift_vec[k-1] += position_weights[i]*sumi1 + position_weights[i+k]*sumi2 ;
00517 else
00518 max_shift_vec[k-1] += sumi1 + sumi2 ;
00519 } ;
00520 }
00521
00522 float64_t result = sum0 ;
00523 for (int32_t i=0; i<max_shift; i++)
00524 result += max_shift_vec[i]/(2*(i+1)) ;
00525
00526 delete[] max_shift_vec;
00527 return result ;
00528 }
00529
00530 float64_t CWeightedDegreePositionStringKernel::compute_without_mismatch_position_weights(
00531 char* avec, float64_t* pos_weights_lhs, int32_t alen, char* bvec,
00532 float64_t* pos_weights_rhs, int32_t blen)
00533 {
00534
00535 float64_t* max_shift_vec = new float64_t[max_shift];
00536 float64_t sum0=0 ;
00537 for (int32_t i=0; i<max_shift; i++)
00538 max_shift_vec[i]=0 ;
00539
00540
00541 for (int32_t i=0; i<alen; i++)
00542 {
00543 if ((position_weights!=NULL) && (position_weights[i]==0.0))
00544 continue ;
00545
00546 float64_t sumi = 0.0 ;
00547 float64_t posweight_lhs = 0.0 ;
00548 float64_t posweight_rhs = 0.0 ;
00549 for (int32_t j=0; (j<degree) && (i+j<alen); j++)
00550 {
00551 posweight_lhs += pos_weights_lhs[i+j] ;
00552 posweight_rhs += pos_weights_rhs[i+j] ;
00553
00554 if (avec[i+j]!=bvec[i+j])
00555 break ;
00556 sumi += weights[j]*(posweight_lhs/(j+1))*(posweight_rhs/(j+1)) ;
00557 }
00558 if (position_weights!=NULL)
00559 sum0 += position_weights[i]*sumi ;
00560 else
00561 sum0 += sumi ;
00562 } ;
00563
00564 for (int32_t i=0; i<alen; i++)
00565 {
00566 for (int32_t k=1; (k<=shift[i]) && (i+k<alen); k++)
00567 {
00568 if ((position_weights!=NULL) && (position_weights[i]==0.0) && (position_weights[i+k]==0.0))
00569 continue ;
00570
00571
00572 float64_t sumi1 = 0.0 ;
00573 float64_t posweight_lhs = 0.0 ;
00574 float64_t posweight_rhs = 0.0 ;
00575 for (int32_t j=0; (j<degree) && (i+j+k<alen); j++)
00576 {
00577 posweight_lhs += pos_weights_lhs[i+j+k] ;
00578 posweight_rhs += pos_weights_rhs[i+j] ;
00579 if (avec[i+j+k]!=bvec[i+j])
00580 break ;
00581 sumi1 += weights[j]*(posweight_lhs/(j+1))*(posweight_rhs/(j+1)) ;
00582 }
00583
00584 float64_t sumi2 = 0.0 ;
00585 posweight_lhs = 0.0 ;
00586 posweight_rhs = 0.0 ;
00587 for (int32_t j=0; (j<degree) && (i+j+k<alen); j++)
00588 {
00589 posweight_lhs += pos_weights_lhs[i+j] ;
00590 posweight_rhs += pos_weights_rhs[i+j+k] ;
00591 if (avec[i+j]!=bvec[i+j+k])
00592 break ;
00593 sumi2 += weights[j]*(posweight_lhs/(j+1))*(posweight_rhs/(j+1)) ;
00594 }
00595 if (position_weights!=NULL)
00596 max_shift_vec[k-1] += position_weights[i]*sumi1 + position_weights[i+k]*sumi2 ;
00597 else
00598 max_shift_vec[k-1] += sumi1 + sumi2 ;
00599 } ;
00600 }
00601
00602 float64_t result = sum0 ;
00603 for (int32_t i=0; i<max_shift; i++)
00604 result += max_shift_vec[i]/(2*(i+1)) ;
00605
00606 delete[] max_shift_vec;
00607 return result ;
00608 }
00609
00610
00611 float64_t CWeightedDegreePositionStringKernel::compute(
00612 int32_t idx_a, int32_t idx_b)
00613 {
00614 int32_t alen, blen;
00615
00616 char* avec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx_a, alen);
00617 char* bvec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx_b, blen);
00618
00619 ASSERT(alen==blen);
00620 ASSERT(shift_len==alen);
00621
00622 float64_t result = 0 ;
00623 if (position_weights_lhs!=NULL || position_weights_rhs!=NULL)
00624 {
00625 ASSERT(max_mismatch==0);
00626 float64_t* position_weights_rhs_ = position_weights_rhs ;
00627 if (lhs==rhs)
00628 {
00629
00630 position_weights_rhs_ = position_weights_lhs ;
00631 }
00632 else
00633 {
00634
00635 }
00636
00637 result = compute_without_mismatch_position_weights(avec, &position_weights_lhs[idx_a*alen], alen, bvec, &position_weights_rhs_[idx_b*blen], blen) ;
00638 }
00639 else if (max_mismatch > 0)
00640 result = compute_with_mismatch(avec, alen, bvec, blen) ;
00641 else if (length==0)
00642 result = compute_without_mismatch(avec, alen, bvec, blen) ;
00643 else
00644 result = compute_without_mismatch_matrix(avec, alen, bvec, blen) ;
00645
00646 return result ;
00647 }
00648
00649
00650 void CWeightedDegreePositionStringKernel::add_example_to_tree(
00651 int32_t idx, float64_t alpha)
00652 {
00653 ASSERT(position_weights_lhs==NULL);
00654 ASSERT(position_weights_rhs==NULL);
00655 ASSERT(alphabet);
00656 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00657
00658 int32_t len=0;
00659 char* char_vec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx, len);
00660 ASSERT(max_mismatch==0);
00661 int32_t *vec = new int32_t[len] ;
00662
00663 for (int32_t i=0; i<len; i++)
00664 vec[i]=alphabet->remap_to_bin(char_vec[i]);
00665
00666 if (opt_type==FASTBUTMEMHUNGRY)
00667 {
00668
00669 ASSERT(!TRIES(get_use_compact_terminal_nodes()));
00670 }
00671
00672 for (int32_t i=0; i<len; i++)
00673 {
00674 int32_t max_s=-1;
00675
00676 if (opt_type==SLOWBUTMEMEFFICIENT)
00677 max_s=0;
00678 else if (opt_type==FASTBUTMEMHUNGRY)
00679 max_s=shift[i];
00680 else {
00681 SG_ERROR( "unknown optimization type\n");
00682 }
00683
00684 for (int32_t s=max_s; s>=0; s--)
00685 {
00686 float64_t alpha_pw = normalizer->normalize_lhs((s==0) ? (alpha) : (alpha/(2.0*s)), idx);
00687 TRIES(add_to_trie(i, s, vec, alpha_pw, weights, (length!=0))) ;
00688
00689
00690 if ((s==0) || (i+s>=len))
00691 continue;
00692
00693 TRIES(add_to_trie(i+s, -s, vec, alpha_pw, weights, (length!=0))) ;
00694
00695 }
00696 }
00697
00698 delete[] vec ;
00699 tree_initialized=true ;
00700 }
00701
00702 void CWeightedDegreePositionStringKernel::add_example_to_single_tree(
00703 int32_t idx, float64_t alpha, int32_t tree_num)
00704 {
00705 ASSERT(position_weights_lhs==NULL);
00706 ASSERT(position_weights_rhs==NULL);
00707 ASSERT(alphabet);
00708 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00709
00710 int32_t len=0;
00711 char* char_vec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx, len);
00712 ASSERT(max_mismatch==0);
00713 int32_t *vec=new int32_t[len];
00714 int32_t max_s=-1;
00715
00716 if (opt_type==SLOWBUTMEMEFFICIENT)
00717 max_s=0;
00718 else if (opt_type==FASTBUTMEMHUNGRY)
00719 {
00720 ASSERT(!tries.get_use_compact_terminal_nodes());
00721 max_s=shift[tree_num];
00722 }
00723 else {
00724 SG_ERROR( "unknown optimization type\n");
00725 }
00726 for (int32_t i=CMath::max(0,tree_num-max_shift); i<CMath::min(len,tree_num+degree+max_shift); i++)
00727 vec[i]=alphabet->remap_to_bin(char_vec[i]);
00728
00729 for (int32_t s=max_s; s>=0; s--)
00730 {
00731 float64_t alpha_pw = normalizer->normalize_lhs((s==0) ? (alpha) : (alpha/(2.0*s)), idx);
00732 tries.add_to_trie(tree_num, s, vec, alpha_pw, weights, (length!=0)) ;
00733
00734 }
00735
00736 if (opt_type==FASTBUTMEMHUNGRY)
00737 {
00738 for (int32_t i=CMath::max(0,tree_num-max_shift); i<CMath::min(len,tree_num+max_shift+1); i++)
00739 {
00740 int32_t s=tree_num-i;
00741 if ((i+s<len) && (s>=1) && (s<=shift[i]))
00742 {
00743 float64_t alpha_pw = normalizer->normalize_lhs((s==0) ? (alpha) : (alpha/(2.0*s)), idx);
00744 tries.add_to_trie(tree_num, -s, vec, alpha_pw, weights, (length!=0)) ;
00745
00746 }
00747 }
00748 }
00749 delete[] vec ;
00750 tree_initialized=true ;
00751 }
00752
00753 float64_t CWeightedDegreePositionStringKernel::compute_by_tree(int32_t idx)
00754 {
00755 ASSERT(position_weights_lhs==NULL);
00756 ASSERT(position_weights_rhs==NULL);
00757 ASSERT(alphabet);
00758 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00759
00760 float64_t sum=0;
00761 int32_t len=0;
00762 char* char_vec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx, len);
00763 ASSERT(max_mismatch==0);
00764 int32_t *vec=new int32_t[len];
00765
00766 for (int32_t i=0; i<len; i++)
00767 vec[i]=alphabet->remap_to_bin(char_vec[i]);
00768
00769 for (int32_t i=0; i<len; i++)
00770 sum += tries.compute_by_tree_helper(vec, len, i, i, i, weights, (length!=0)) ;
00771
00772 if (opt_type==SLOWBUTMEMEFFICIENT)
00773 {
00774 for (int32_t i=0; i<len; i++)
00775 {
00776 for (int32_t s=1; (s<=shift[i]) && (i+s<len); s++)
00777 {
00778 sum+=tries.compute_by_tree_helper(vec, len, i, i+s, i, weights, (length!=0))/(2*s) ;
00779 sum+=tries.compute_by_tree_helper(vec, len, i+s, i, i+s, weights, (length!=0))/(2*s) ;
00780 }
00781 }
00782 }
00783
00784 delete[] vec ;
00785
00786 return normalizer->normalize_rhs(sum, idx);
00787 }
00788
00789 void CWeightedDegreePositionStringKernel::compute_by_tree(
00790 int32_t idx, float64_t* LevelContrib)
00791 {
00792 ASSERT(position_weights_lhs==NULL);
00793 ASSERT(position_weights_rhs==NULL);
00794 ASSERT(alphabet);
00795 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00796
00797 int32_t len=0;
00798 char* char_vec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx, len);
00799 ASSERT(max_mismatch==0);
00800 int32_t *vec=new int32_t[len];
00801
00802 for (int32_t i=0; i<len; i++)
00803 vec[i]=alphabet->remap_to_bin(char_vec[i]);
00804
00805 for (int32_t i=0; i<len; i++)
00806 {
00807 tries.compute_by_tree_helper(vec, len, i, i, i, LevelContrib,
00808 normalizer->normalize_rhs(1.0, idx), mkl_stepsize, weights,
00809 (length!=0));
00810 }
00811
00812 if (opt_type==SLOWBUTMEMEFFICIENT)
00813 {
00814 for (int32_t i=0; i<len; i++)
00815 for (int32_t k=1; (k<=shift[i]) && (i+k<len); k++)
00816 {
00817 tries.compute_by_tree_helper(vec, len, i, i+k, i, LevelContrib,
00818 normalizer->normalize_rhs(1.0/(2*k), idx), mkl_stepsize,
00819 weights, (length!=0)) ;
00820 tries.compute_by_tree_helper(vec, len, i+k, i, i+k,
00821 LevelContrib, normalizer->normalize_rhs(1.0/(2*k), idx),
00822 mkl_stepsize, weights, (length!=0)) ;
00823 }
00824 }
00825
00826 delete[] vec ;
00827 }
00828
00829 float64_t* CWeightedDegreePositionStringKernel::compute_abs_weights(
00830 int32_t &len)
00831 {
00832 return tries.compute_abs_weights(len);
00833 }
00834
00835 bool CWeightedDegreePositionStringKernel::set_shifts(
00836 int32_t* shift_, int32_t shift_len_)
00837 {
00838 delete[] shift;
00839
00840 shift_len = shift_len_ ;
00841 shift = new int32_t[shift_len] ;
00842
00843 if (shift)
00844 {
00845 max_shift = 0 ;
00846
00847 for (int32_t i=0; i<shift_len; i++)
00848 {
00849 shift[i] = shift_[i] ;
00850 if (shift[i]>max_shift)
00851 max_shift = shift[i] ;
00852 }
00853
00854 ASSERT(max_shift>=0 && max_shift<=shift_len);
00855 }
00856
00857 return false;
00858 }
00859
00860 bool CWeightedDegreePositionStringKernel::set_wd_weights()
00861 {
00862 ASSERT(degree>0);
00863
00864 delete[] weights;
00865 weights=new float64_t[degree];
00866 if (weights)
00867 {
00868 int32_t i;
00869 float64_t sum=0;
00870 for (i=0; i<degree; i++)
00871 {
00872 weights[i]=degree-i;
00873 sum+=weights[i];
00874 }
00875 for (i=0; i<degree; i++)
00876 weights[i]/=sum;
00877
00878 for (i=0; i<degree; i++)
00879 {
00880 for (int32_t j=1; j<=max_mismatch; j++)
00881 {
00882 if (j<i+1)
00883 {
00884 int32_t nk=CMath::nchoosek(i+1, j);
00885 weights[i+j*degree]=weights[i]/(nk*CMath::pow(3.0,j));
00886 }
00887 else
00888 weights[i+j*degree]= 0;
00889 }
00890 }
00891
00892 return true;
00893 }
00894 else
00895 return false;
00896 }
00897
00898 bool CWeightedDegreePositionStringKernel::set_weights(
00899 float64_t* ws, int32_t d, int32_t len)
00900 {
00901 SG_DEBUG("degree = %i d=%i\n", degree, d);
00902 degree=d;
00903 length=len;
00904
00905 if (len <= 0)
00906 len=1;
00907
00908 delete[] weights;
00909 weights=new float64_t[d*len];
00910
00911 if (weights)
00912 {
00913 for (int32_t i=0; i<degree*len; i++)
00914 weights[i]=ws[i];
00915 return true;
00916 }
00917 else
00918 return false;
00919 }
00920
00921 bool CWeightedDegreePositionStringKernel::set_position_weights(
00922 float64_t* pws, int32_t len)
00923 {
00924
00925
00926 if (len==0)
00927 {
00928 delete[] position_weights;
00929 position_weights = NULL;
00930 tries.set_position_weights(position_weights);
00931 return true;
00932 }
00933 if (seq_length==0)
00934 seq_length=len;
00935
00936 if (seq_length!=len)
00937 {
00938 SG_ERROR("seq_length = %i, position_weights_length=%i\n", seq_length, len);
00939 return false;
00940 }
00941 delete[] position_weights;
00942 position_weights=new float64_t[len];
00943 tries.set_position_weights(position_weights);
00944
00945 if (position_weights)
00946 {
00947 for (int32_t i=0; i<len; i++)
00948 position_weights[i]=pws[i];
00949 return true;
00950 }
00951 else
00952 return false;
00953 }
00954
00955 bool CWeightedDegreePositionStringKernel::set_position_weights_lhs(float64_t* pws, int32_t len, int32_t num)
00956 {
00957 fprintf(stderr, "set_position_weights_lhs %i %i %i\n", len, num, seq_length);
00958
00959 if (position_weights_rhs==position_weights_lhs)
00960 position_weights_rhs=NULL;
00961 else
00962 delete_position_weights_rhs();
00963
00964 if (len==0)
00965 {
00966 return delete_position_weights_lhs();
00967 }
00968
00969 if (seq_length!=len)
00970 {
00971 SG_ERROR("seq_length = %i, position_weights_length=%i\n", seq_length, len);
00972 return false;
00973 }
00974 if (0)
00975 {
00976
00977 if (!lhs)
00978 {
00979 SG_ERROR("lhs=NULL\n");
00980 return false ;
00981 }
00982 if (lhs->get_num_vectors()!=num)
00983 {
00984 SG_ERROR("lhs->get_num_vectors()=%i, num=%i\n", lhs->get_num_vectors(), num);
00985 return false;
00986 }
00987 }
00988
00989 delete[] position_weights_lhs;
00990 position_weights_lhs=new float64_t[len*num];
00991 if (position_weights_lhs)
00992 {
00993 for (int32_t i=0; i<len*num; i++)
00994 position_weights_lhs[i]=pws[i];
00995 return true;
00996 }
00997 else
00998 return false;
00999 }
01000
01001 bool CWeightedDegreePositionStringKernel::set_position_weights_rhs(
01002 float64_t* pws, int32_t len, int32_t num)
01003 {
01004 fprintf(stderr, "set_position_weights_rhs %i %i %i\n", len, num, seq_length);
01005 if (len==0)
01006 {
01007 if (position_weights_rhs==position_weights_lhs)
01008 {
01009 position_weights_rhs=NULL;
01010 return true;
01011 }
01012 return delete_position_weights_rhs();
01013 }
01014
01015 if (seq_length!=len)
01016 {
01017 SG_ERROR("seq_length = %i, position_weights_length=%i\n", seq_length, len);
01018 return false;
01019 }
01020 if (0)
01021 {
01022
01023 if (!rhs)
01024 {
01025 if (!lhs)
01026 {
01027 SG_ERROR("rhs=NULL and lhs=NULL\n");
01028 return false;
01029 }
01030 if (lhs->get_num_vectors()!=num)
01031 {
01032 SG_ERROR("lhs->get_num_vectors()=%i, num=%i\n", lhs->get_num_vectors(), num);
01033 return false;
01034 }
01035 } else
01036 if (rhs->get_num_vectors()!=num)
01037 {
01038 SG_ERROR("rhs->get_num_vectors()=%i, num=%i\n", rhs->get_num_vectors(), num);
01039 return false;
01040 }
01041 }
01042
01043 delete[] position_weights_rhs;
01044 position_weights_rhs=new float64_t[len*num];
01045 if (position_weights_rhs)
01046 {
01047 for (int32_t i=0; i<len*num; i++)
01048 position_weights_rhs[i]=pws[i];
01049 return true;
01050 }
01051 else
01052 return false;
01053 }
01054
01055 bool CWeightedDegreePositionStringKernel::init_block_weights_from_wd()
01056 {
01057 delete[] block_weights;
01058 block_weights=new float64_t[CMath::max(seq_length,degree)];
01059
01060 if (block_weights)
01061 {
01062 int32_t k;
01063 float64_t d=degree;
01064
01065 for (k=0; k<degree; k++)
01066 block_weights[k]=
01067 (-CMath::pow(k, 3)+(3*d-3)*CMath::pow(k, 2)+(9*d-2)*k+6*d)/(3*d*(d+1));
01068 for (k=degree; k<seq_length; k++)
01069 block_weights[k]=(-d+3*k+4)/3;
01070 }
01071
01072 return (block_weights!=NULL);
01073 }
01074
01075 bool CWeightedDegreePositionStringKernel::init_block_weights_from_wd_external()
01076 {
01077 ASSERT(weights);
01078 delete[] block_weights;
01079 block_weights=new float64_t[CMath::max(seq_length,degree)];
01080
01081 if (block_weights)
01082 {
01083 int32_t i=0;
01084 block_weights[0]=weights[0];
01085 for (i=1; i<CMath::max(seq_length,degree); i++)
01086 block_weights[i]=0;
01087
01088 for (i=1; i<CMath::max(seq_length,degree); i++)
01089 {
01090 block_weights[i]=block_weights[i-1];
01091
01092 float64_t contrib=0;
01093 for (int32_t j=0; j<CMath::min(degree,i+1); j++)
01094 contrib+=weights[j];
01095
01096 block_weights[i]+=contrib;
01097 }
01098 }
01099
01100 return (block_weights!=NULL);
01101 }
01102
01103 bool CWeightedDegreePositionStringKernel::init_block_weights_const()
01104 {
01105 delete[] block_weights;
01106 block_weights=new float64_t[seq_length];
01107
01108 if (block_weights)
01109 {
01110 for (int32_t i=1; i<seq_length+1 ; i++)
01111 block_weights[i-1]=1.0/seq_length;
01112 }
01113
01114 return (block_weights!=NULL);
01115 }
01116
01117 bool CWeightedDegreePositionStringKernel::init_block_weights_linear()
01118 {
01119 delete[] block_weights;
01120 block_weights=new float64_t[seq_length];
01121
01122 if (block_weights)
01123 {
01124 for (int32_t i=1; i<seq_length+1 ; i++)
01125 block_weights[i-1]=degree*i;
01126 }
01127
01128 return (block_weights!=NULL);
01129 }
01130
01131 bool CWeightedDegreePositionStringKernel::init_block_weights_sqpoly()
01132 {
01133 delete[] block_weights;
01134 block_weights=new float64_t[seq_length];
01135
01136 if (block_weights)
01137 {
01138 for (int32_t i=1; i<degree+1 ; i++)
01139 block_weights[i-1]=((float64_t) i)*i;
01140
01141 for (int32_t i=degree+1; i<seq_length+1 ; i++)
01142 block_weights[i-1]=i;
01143 }
01144
01145 return (block_weights!=NULL);
01146 }
01147
01148 bool CWeightedDegreePositionStringKernel::init_block_weights_cubicpoly()
01149 {
01150 delete[] block_weights;
01151 block_weights=new float64_t[seq_length];
01152
01153 if (block_weights)
01154 {
01155 for (int32_t i=1; i<degree+1 ; i++)
01156 block_weights[i-1]=((float64_t) i)*i*i;
01157
01158 for (int32_t i=degree+1; i<seq_length+1 ; i++)
01159 block_weights[i-1]=i;
01160 }
01161
01162 return (block_weights!=NULL);
01163 }
01164
01165 bool CWeightedDegreePositionStringKernel::init_block_weights_exp()
01166 {
01167 delete[] block_weights;
01168 block_weights=new float64_t[seq_length];
01169
01170 if (block_weights)
01171 {
01172 for (int32_t i=1; i<degree+1 ; i++)
01173 block_weights[i-1]=exp(((float64_t) i/10.0));
01174
01175 for (int32_t i=degree+1; i<seq_length+1 ; i++)
01176 block_weights[i-1]=i;
01177 }
01178
01179 return (block_weights!=NULL);
01180 }
01181
01182 bool CWeightedDegreePositionStringKernel::init_block_weights_log()
01183 {
01184 delete[] block_weights;
01185 block_weights=new float64_t[seq_length];
01186
01187 if (block_weights)
01188 {
01189 for (int32_t i=1; i<degree+1 ; i++)
01190 block_weights[i-1]=CMath::pow(CMath::log((float64_t) i),2);
01191
01192 for (int32_t i=degree+1; i<seq_length+1 ; i++)
01193 block_weights[i-1]=i-degree+1+CMath::pow(CMath::log(degree+1.0),2);
01194 }
01195
01196 return (block_weights!=NULL);
01197 }
01198
01199 bool CWeightedDegreePositionStringKernel::init_block_weights_external()
01200 {
01201 if (block_weights_external && (seq_length == num_block_weights_external) )
01202 {
01203 delete[] block_weights;
01204 block_weights=new float64_t[seq_length];
01205
01206 if (block_weights)
01207 {
01208 for (int32_t i=0; i<seq_length; i++)
01209 block_weights[i]=block_weights_external[i];
01210 }
01211 }
01212 else {
01213 SG_ERROR( "sequence longer then weights (seqlen:%d, wlen:%d)\n", seq_length, block_weights_external);
01214 }
01215 return (block_weights!=NULL);
01216 }
01217
01218 bool CWeightedDegreePositionStringKernel::init_block_weights()
01219 {
01220 switch (type)
01221 {
01222 case E_WD:
01223 return init_block_weights_from_wd();
01224 case E_EXTERNAL:
01225 return init_block_weights_from_wd_external();
01226 case E_BLOCK_CONST:
01227 return init_block_weights_const();
01228 case E_BLOCK_LINEAR:
01229 return init_block_weights_linear();
01230 case E_BLOCK_SQPOLY:
01231 return init_block_weights_sqpoly();
01232 case E_BLOCK_CUBICPOLY:
01233 return init_block_weights_cubicpoly();
01234 case E_BLOCK_EXP:
01235 return init_block_weights_exp();
01236 case E_BLOCK_LOG:
01237 return init_block_weights_log();
01238 case E_BLOCK_EXTERNAL:
01239 return init_block_weights_external();
01240 default:
01241 return false;
01242 };
01243 }
01244
01245
01246
01247 void* CWeightedDegreePositionStringKernel::compute_batch_helper(void* p)
01248 {
01249 S_THREAD_PARAM<DNATrie>* params = (S_THREAD_PARAM<DNATrie>*) p;
01250 int32_t j=params->j;
01251 CWeightedDegreePositionStringKernel* wd=params->kernel;
01252 CTrie<DNATrie>* tries=params->tries;
01253 float64_t* weights=params->weights;
01254 int32_t length=params->length;
01255 int32_t max_shift=params->max_shift;
01256 int32_t* vec=params->vec;
01257 float64_t* result=params->result;
01258 float64_t factor=params->factor;
01259 int32_t* shift=params->shift;
01260 int32_t* vec_idx=params->vec_idx;
01261
01262 for (int32_t i=params->start; i<params->end; i++)
01263 {
01264 int32_t len=0;
01265 CStringFeatures<char>* rhs_feat=((CStringFeatures<char>*) wd->get_rhs());
01266 CAlphabet* alpha=wd->alphabet;
01267
01268 char* char_vec=rhs_feat->get_feature_vector(vec_idx[i], len);
01269 for (int32_t k=CMath::max(0,j-max_shift); k<CMath::min(len,j+wd->get_degree()+max_shift); k++)
01270 vec[k]=alpha->remap_to_bin(char_vec[k]);
01271
01272 SG_UNREF(rhs_feat);
01273
01274 result[i] += factor*wd->normalizer->normalize_rhs(tries->compute_by_tree_helper(vec, len, j, j, j, weights, (length!=0)), vec_idx[i]);
01275
01276 if (wd->get_optimization_type()==SLOWBUTMEMEFFICIENT)
01277 {
01278 for (int32_t q=CMath::max(0,j-max_shift); q<CMath::min(len,j+max_shift+1); q++)
01279 {
01280 int32_t s=j-q ;
01281 if ((s>=1) && (s<=shift[q]) && (q+s<len))
01282 {
01283 result[i] +=
01284 wd->normalizer->normalize_rhs(tries->compute_by_tree_helper(vec,
01285 len, q, q+s, q, weights, (length!=0)),
01286 vec_idx[i])/(2.0*s);
01287 }
01288 }
01289
01290 for (int32_t s=1; (s<=shift[j]) && (j+s<len); s++)
01291 {
01292 result[i] +=
01293 wd->normalizer->normalize_rhs(tries->compute_by_tree_helper(vec,
01294 len, j+s, j, j+s, weights, (length!=0)),
01295 vec_idx[i])/(2.0*s);
01296 }
01297 }
01298 }
01299
01300 return NULL;
01301 }
01302
01303 void CWeightedDegreePositionStringKernel::compute_batch(
01304 int32_t num_vec, int32_t* vec_idx, float64_t* result, int32_t num_suppvec,
01305 int32_t* IDX, float64_t* alphas, float64_t factor)
01306 {
01307 ASSERT(alphabet);
01308 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
01309 ASSERT(position_weights_lhs==NULL);
01310 ASSERT(position_weights_rhs==NULL);
01311 ASSERT(rhs);
01312 ASSERT(num_vec<=rhs->get_num_vectors());
01313 ASSERT(num_vec>0);
01314 ASSERT(vec_idx);
01315 ASSERT(result);
01316 create_empty_tries();
01317
01318 int32_t num_feat=((CStringFeatures<char>*) rhs)->get_max_vector_length();
01319 ASSERT(num_feat>0);
01320 int32_t num_threads=parallel->get_num_threads();
01321 ASSERT(num_threads>0);
01322 int32_t* vec=new int32_t[num_threads*num_feat];
01323
01324 if (num_threads < 2)
01325 {
01326 #ifdef WIN32
01327 for (int32_t j=0; j<num_feat; j++)
01328 #else
01329 CSignal::clear_cancel();
01330 for (int32_t j=0; j<num_feat && !CSignal::cancel_computations(); j++)
01331 #endif
01332 {
01333 init_optimization(num_suppvec, IDX, alphas, j);
01334 S_THREAD_PARAM<DNATrie> params;
01335 params.vec=vec;
01336 params.result=result;
01337 params.weights=weights;
01338 params.kernel=this;
01339 params.tries=&tries;
01340 params.factor=factor;
01341 params.j=j;
01342 params.start=0;
01343 params.end=num_vec;
01344 params.length=length;
01345 params.max_shift=max_shift;
01346 params.shift=shift;
01347 params.vec_idx=vec_idx;
01348 compute_batch_helper((void*) ¶ms);
01349
01350 SG_PROGRESS(j,0,num_feat);
01351 }
01352 }
01353 #ifndef WIN32
01354 else
01355 {
01356
01357 CSignal::clear_cancel();
01358 for (int32_t j=0; j<num_feat && !CSignal::cancel_computations(); j++)
01359 {
01360 init_optimization(num_suppvec, IDX, alphas, j);
01361 pthread_t threads[num_threads-1];
01362 S_THREAD_PARAM<DNATrie> params[num_threads];
01363 int32_t step= num_vec/num_threads;
01364 int32_t t;
01365
01366 for (t=0; t<num_threads-1; t++)
01367 {
01368 params[t].vec=&vec[num_feat*t];
01369 params[t].result=result;
01370 params[t].weights=weights;
01371 params[t].kernel=this;
01372 params[t].tries=&tries;
01373 params[t].factor=factor;
01374 params[t].j=j;
01375 params[t].start = t*step;
01376 params[t].end = (t+1)*step;
01377 params[t].length=length;
01378 params[t].max_shift=max_shift;
01379 params[t].shift=shift;
01380 params[t].vec_idx=vec_idx;
01381 pthread_create(&threads[t], NULL, CWeightedDegreePositionStringKernel::compute_batch_helper, (void*)¶ms[t]);
01382 }
01383
01384 params[t].vec=&vec[num_feat*t];
01385 params[t].result=result;
01386 params[t].weights=weights;
01387 params[t].kernel=this;
01388 params[t].tries=&tries;
01389 params[t].factor=factor;
01390 params[t].j=j;
01391 params[t].start=t*step;
01392 params[t].end=num_vec;
01393 params[t].length=length;
01394 params[t].max_shift=max_shift;
01395 params[t].shift=shift;
01396 params[t].vec_idx=vec_idx;
01397 compute_batch_helper((void*) ¶ms[t]);
01398
01399 for (t=0; t<num_threads-1; t++)
01400 pthread_join(threads[t], NULL);
01401 SG_PROGRESS(j,0,num_feat);
01402 }
01403 }
01404 #endif
01405
01406 delete[] vec;
01407
01408
01409
01410 create_empty_tries();
01411 }
01412
01413 float64_t* CWeightedDegreePositionStringKernel::compute_scoring(
01414 int32_t max_degree, int32_t& num_feat, int32_t& num_sym, float64_t* result,
01415 int32_t num_suppvec, int32_t* IDX, float64_t* alphas)
01416 {
01417 ASSERT(position_weights_lhs==NULL);
01418 ASSERT(position_weights_rhs==NULL);
01419
01420 num_feat=((CStringFeatures<char>*) rhs)->get_max_vector_length();
01421 ASSERT(num_feat>0);
01422 ASSERT(alphabet);
01423 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
01424
01425 num_sym=4;
01426
01427 ASSERT(max_degree>0);
01428
01429
01430 int32_t* nofsKmers=new int32_t[max_degree];
01431 float64_t** C=new float64_t*[max_degree];
01432 float64_t** L=new float64_t*[max_degree];
01433 float64_t** R=new float64_t*[max_degree];
01434
01435 int32_t i;
01436 int32_t k;
01437
01438
01439 int32_t bigtabSize=0;
01440 for (k=0; k<max_degree; ++k )
01441 {
01442 nofsKmers[k]=(int32_t) CMath::pow(num_sym, k+1);
01443 const int32_t tabSize=nofsKmers[k]*num_feat;
01444 bigtabSize+=tabSize;
01445 }
01446 result=new float64_t[bigtabSize];
01447
01448
01449 int32_t tabOffs=0;
01450 for( k = 0; k < max_degree; ++k )
01451 {
01452 const int32_t tabSize = nofsKmers[k] * num_feat;
01453 C[k] = &result[tabOffs];
01454 L[k] = new float64_t[ tabSize ];
01455 R[k] = new float64_t[ tabSize ];
01456 tabOffs+=tabSize;
01457 for(i = 0; i < tabSize; i++ )
01458 {
01459 C[k][i] = 0.0;
01460 L[k][i] = 0.0;
01461 R[k][i] = 0.0;
01462 }
01463 }
01464
01465
01466 float64_t* margFactors=new float64_t[degree];
01467
01468 int32_t* x = new int32_t[ degree+1 ];
01469 int32_t* substrs = new int32_t[ degree+1 ];
01470
01471 margFactors[0] = 1.0;
01472 substrs[0] = 0;
01473 for( k=1; k < degree; ++k ) {
01474 margFactors[k] = 0.25 * margFactors[k-1];
01475 substrs[k] = -1;
01476 }
01477 substrs[degree] = -1;
01478
01479 struct TreeParseInfo info;
01480 info.num_sym = num_sym;
01481 info.num_feat = num_feat;
01482 info.p = -1;
01483 info.k = -1;
01484 info.nofsKmers = nofsKmers;
01485 info.margFactors = margFactors;
01486 info.x = x;
01487 info.substrs = substrs;
01488 info.y0 = 0;
01489 info.C_k = NULL;
01490 info.L_k = NULL;
01491 info.R_k = NULL;
01492
01493
01494 i = 0;
01495 for( k = 0; k < max_degree; ++k )
01496 {
01497 const int32_t nofKmers = nofsKmers[ k ];
01498 info.C_k = C[k];
01499 info.L_k = L[k];
01500 info.R_k = R[k];
01501
01502
01503 for(int32_t p = 0; p < num_feat; ++p )
01504 {
01505 init_optimization( num_suppvec, IDX, alphas, p );
01506 int32_t tree = p ;
01507 for(int32_t j = 0; j < degree+1; j++ ) {
01508 x[j] = -1;
01509 }
01510 tries.traverse( tree, p, info, 0, x, k );
01511 SG_PROGRESS(i++,0,num_feat*max_degree);
01512 }
01513
01514
01515 if( k > 0 ) {
01516 const int32_t j = k - 1;
01517 const int32_t nofJmers = (int32_t) CMath::pow( num_sym, j+1 );
01518 for(int32_t p = 0; p < num_feat; ++p ) {
01519 const int32_t offsetJ = nofJmers * p;
01520 const int32_t offsetJ1 = nofJmers * (p+1);
01521 const int32_t offsetK = nofKmers * p;
01522 int32_t y;
01523 int32_t sym;
01524 for( y = 0; y < nofJmers; ++y ) {
01525 for( sym = 0; sym < num_sym; ++sym ) {
01526 const int32_t y_sym = num_sym*y + sym;
01527 const int32_t sym_y = nofJmers*sym + y;
01528 ASSERT(0<=y_sym && y_sym<nofKmers);
01529 ASSERT(0<=sym_y && sym_y<nofKmers);
01530 C[k][ y_sym + offsetK ] += L[j][ y + offsetJ ];
01531 if( p < num_feat-1 ) {
01532 C[k][ sym_y + offsetK ] += R[j][ y + offsetJ1 ];
01533 }
01534 }
01535 }
01536 }
01537 }
01538
01539
01540
01541
01542
01543
01544
01545
01546
01547
01548
01549 }
01550
01551
01552 num_feat=1;
01553 num_sym = bigtabSize;
01554
01555 delete[] nofsKmers;
01556 delete[] margFactors;
01557 delete[] substrs;
01558 delete[] x;
01559 delete[] C;
01560 for( k = 0; k < max_degree; ++k ) {
01561 delete[] L[k];
01562 delete[] R[k];
01563 }
01564 delete[] L;
01565 delete[] R;
01566 return result;
01567 }
01568
01569 char* CWeightedDegreePositionStringKernel::compute_consensus(
01570 int32_t &num_feat, int32_t num_suppvec, int32_t* IDX, float64_t* alphas)
01571 {
01572 ASSERT(position_weights_lhs==NULL);
01573 ASSERT(position_weights_rhs==NULL);
01574
01575 ASSERT(degree<=32);
01576 ASSERT(!tries.get_use_compact_terminal_nodes());
01577 num_feat=((CStringFeatures<char>*) rhs)->get_max_vector_length();
01578 ASSERT(num_feat>0);
01579 ASSERT(alphabet);
01580 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
01581
01582
01583 char* result=new char[num_feat];
01584
01585
01586 int32_t num_tables=CMath::max(1,num_feat-degree+1);
01587 CDynamicArray<ConsensusEntry>** table=new CDynamicArray<ConsensusEntry>*[num_tables];
01588
01589 for (int32_t i=0; i<num_tables; i++)
01590 table[i]=new CDynamicArray<ConsensusEntry>(num_suppvec/10);
01591
01592
01593 for (int32_t i=0; i<num_tables; i++)
01594 {
01595 bool cumulative=false;
01596
01597 if (i<num_tables-1)
01598 init_optimization(num_suppvec, IDX, alphas, i);
01599 else
01600 {
01601 init_optimization(num_suppvec, IDX, alphas, i, num_feat-1);
01602 cumulative=true;
01603 }
01604
01605 if (i==0)
01606 tries.fill_backtracking_table(i, NULL, table[i], cumulative, weights);
01607 else
01608 tries.fill_backtracking_table(i, table[i-1], table[i], cumulative, weights);
01609
01610 SG_PROGRESS(i,0,num_feat);
01611 }
01612
01613
01614
01615
01616
01617
01618
01619
01620
01621
01622
01623
01624
01625
01626
01627
01628
01629
01630
01631
01632
01633
01634
01635 const char* acgt="ACGT";
01636
01637
01638 int32_t max_idx=-1;
01639 float32_t max_score=0;
01640 int32_t num_elements=table[num_tables-1]->get_num_elements();
01641
01642 for (int32_t i=0; i<num_elements; i++)
01643 {
01644 float64_t sc=table[num_tables-1]->get_element(i).score;
01645 if (sc>max_score || max_idx==-1)
01646 {
01647 max_idx=i;
01648 max_score=sc;
01649 }
01650 }
01651 uint64_t endstr=table[num_tables-1]->get_element(max_idx).string;
01652
01653 SG_INFO("max_idx:%d num_el:%d num_feat:%d num_tables:%d max_score:%f\n", max_idx, num_elements, num_feat, num_tables, max_score);
01654
01655 for (int32_t i=0; i<degree; i++)
01656 result[num_feat-1-i]=acgt[(endstr >> (2*i)) & 3];
01657
01658 if (num_tables>1)
01659 {
01660 for (int32_t i=num_tables-1; i>=0; i--)
01661 {
01662
01663 result[i]=acgt[table[i]->get_element(max_idx).string >> (2*(degree-1)) & 3];
01664 max_idx=table[i]->get_element(max_idx).bt;
01665 }
01666 }
01667
01668
01669
01670
01671
01672
01673
01674
01675
01676
01677
01678 for (int32_t i=0; i<num_tables; i++)
01679 delete table[i];
01680
01681 delete[] table;
01682 return result;
01683 }
01684
01685
01686 float64_t* CWeightedDegreePositionStringKernel::extract_w(
01687 int32_t max_degree, int32_t& num_feat, int32_t& num_sym,
01688 float64_t* w_result, int32_t num_suppvec, int32_t* IDX, float64_t* alphas)
01689 {
01690 delete_optimization();
01691 use_poim_tries=true;
01692 poim_tries.delete_trees(false);
01693
01694
01695 ASSERT(position_weights_lhs==NULL);
01696 ASSERT(position_weights_rhs==NULL);
01697 num_feat=((CStringFeatures<char>*) rhs)->get_max_vector_length();
01698 ASSERT(num_feat>0);
01699 ASSERT(alphabet->get_alphabet()==DNA);
01700 ASSERT(max_degree>0);
01701
01702
01703 static const int32_t NUM_SYMS = poim_tries.NUM_SYMS;
01704 const int32_t seqLen = num_feat;
01705 float64_t** subs;
01706 int32_t i;
01707 int32_t k;
01708
01709
01710
01711
01712 int32_t* offsets;
01713 int32_t offset;
01714 offsets = new int32_t[ max_degree ];
01715 offset = 0;
01716 for( k = 0; k < max_degree; ++k ) {
01717 offsets[k] = offset;
01718 const int32_t nofsKmers = (int32_t) CMath::pow( NUM_SYMS, k+1 );
01719 const int32_t tabSize = nofsKmers * seqLen;
01720 offset += tabSize;
01721 }
01722
01723 const int32_t bigTabSize = offset;
01724 w_result=new float64_t[bigTabSize];
01725 for (i=0; i<bigTabSize; ++i)
01726 w_result[i]=0;
01727
01728
01729 subs = new float64_t*[ max_degree ];
01730 ASSERT( subs != NULL );
01731 for( k = 0; k < max_degree; ++k ) {
01732 subs[k] = &w_result[ offsets[k] ];
01733 }
01734 delete[] offsets;
01735
01736
01737 init_optimization( num_suppvec, IDX, alphas, -1);
01738 poim_tries.POIMs_extract_W( subs, max_degree );
01739
01740
01741 delete[] subs;
01742 num_feat = 1;
01743 num_sym = bigTabSize;
01744 use_poim_tries=false;
01745 poim_tries.delete_trees(false);
01746 return w_result;
01747 }
01748
01749 float64_t* CWeightedDegreePositionStringKernel::compute_POIM(
01750 int32_t max_degree, int32_t& num_feat, int32_t& num_sym,
01751 float64_t* poim_result, int32_t num_suppvec, int32_t* IDX,
01752 float64_t* alphas, float64_t* distrib )
01753 {
01754 delete_optimization();
01755 use_poim_tries=true;
01756 poim_tries.delete_trees(false);
01757
01758
01759 ASSERT(position_weights_lhs==NULL);
01760 ASSERT(position_weights_rhs==NULL);
01761 num_feat=((CStringFeatures<char>*) rhs)->get_max_vector_length();
01762 ASSERT(num_feat>0);
01763 ASSERT(alphabet->get_alphabet()==DNA);
01764 ASSERT(max_degree!=0);
01765 ASSERT(distrib);
01766
01767
01768 static const int32_t NUM_SYMS = poim_tries.NUM_SYMS;
01769 const int32_t seqLen = num_feat;
01770 float64_t** subs;
01771 int32_t i;
01772 int32_t k;
01773
01774
01775
01776
01777
01778
01779
01780
01781
01782
01783
01784
01785 const int32_t debug = ( max_degree < 0 ) ? ( abs(max_degree) % 4 + 1 ) : 0;
01786 if( debug ) {
01787 max_degree = abs(max_degree) / 4;
01788 switch( debug ) {
01789 case 1: {
01790 printf( "POIM DEBUGGING: substring only (max order=%d)\n", max_degree );
01791 break;
01792 }
01793 case 2: {
01794 printf( "POIM DEBUGGING: superstring only (max order=%d)\n", max_degree );
01795 break;
01796 }
01797 case 3: {
01798 printf( "POIM DEBUGGING: left overlap only (max order=%d)\n", max_degree );
01799 break;
01800 }
01801 case 4: {
01802 printf( "POIM DEBUGGING: right overlap only (max order=%d)\n", max_degree );
01803 break;
01804 }
01805 default: {
01806 printf( "POIM DEBUGGING: something is wrong (max order=%d)\n", max_degree );
01807 ASSERT(0);
01808 break;
01809 }
01810 }
01811 }
01812
01813
01814 int32_t* offsets;
01815 int32_t offset;
01816 offsets = new int32_t[ max_degree ];
01817 offset = 0;
01818 for( k = 0; k < max_degree; ++k ) {
01819 offsets[k] = offset;
01820 const int32_t nofsKmers = (int32_t) CMath::pow( NUM_SYMS, k+1 );
01821 const int32_t tabSize = nofsKmers * seqLen;
01822 offset += tabSize;
01823 }
01824
01825 const int32_t bigTabSize=offset;
01826 poim_result=new float64_t[bigTabSize];
01827 for (i=0; i<bigTabSize; ++i )
01828 poim_result[i]=0;
01829
01830
01831 subs=new float64_t*[max_degree];
01832 for (k=0; k<max_degree; ++k)
01833 subs[k]=&poim_result[offsets[k]];
01834
01835 delete[] offsets;
01836
01837
01838 init_optimization( num_suppvec, IDX, alphas, -1);
01839 poim_tries.POIMs_precalc_SLR( distrib );
01840
01841
01842 if( debug==0 || debug==1 ) {
01843 poim_tries.POIMs_extract_W( subs, max_degree );
01844 for( k = 1; k < max_degree; ++k ) {
01845 const int32_t nofKmers2 = ( k > 1 ) ? (int32_t) CMath::pow(NUM_SYMS,k-1) : 0;
01846 const int32_t nofKmers1 = (int32_t) CMath::pow( NUM_SYMS, k );
01847 const int32_t nofKmers0 = nofKmers1 * NUM_SYMS;
01848 for( i = 0; i < seqLen; ++i ) {
01849 float64_t* const subs_k2i1 = ( k>1 && i<seqLen-1 ) ? &subs[k-2][(i+1)*nofKmers2] : NULL;
01850 float64_t* const subs_k1i1 = ( i < seqLen-1 ) ? &subs[k-1][(i+1)*nofKmers1] : NULL;
01851 float64_t* const subs_k1i0 = & subs[ k-1 ][ i*nofKmers1 ];
01852 float64_t* const subs_k0i = & subs[ k-0 ][ i*nofKmers0 ];
01853 int32_t y0;
01854 for( y0 = 0; y0 < nofKmers0; ++y0 ) {
01855 const int32_t y1l = y0 / NUM_SYMS;
01856 const int32_t y1r = y0 % nofKmers1;
01857 const int32_t y2 = y1r / NUM_SYMS;
01858 subs_k0i[ y0 ] += subs_k1i0[ y1l ];
01859 if( i < seqLen-1 ) {
01860 subs_k0i[ y0 ] += subs_k1i1[ y1r ];
01861 if( k > 1 ) {
01862 subs_k0i[ y0 ] -= subs_k2i1[ y2 ];
01863 }
01864 }
01865 }
01866 }
01867 }
01868 }
01869
01870
01871 poim_tries.POIMs_add_SLR( subs, max_degree, debug );
01872
01873
01874 delete[] subs;
01875 num_feat = 1;
01876 num_sym = bigTabSize;
01877
01878 use_poim_tries=false;
01879 poim_tries.delete_trees(false);
01880
01881 return poim_result;
01882 }
01883
01884
01885 void CWeightedDegreePositionStringKernel::prepare_POIM2(
01886 float64_t* distrib, int32_t num_sym, int32_t num_feat)
01887 {
01888 free(m_poim_distrib);
01889 m_poim_distrib=(float64_t*)malloc(num_sym*num_feat*sizeof(float64_t));
01890 ASSERT(m_poim_distrib);
01891
01892 memcpy(m_poim_distrib, distrib, num_sym*num_feat*sizeof(float64_t));
01893 m_poim_num_sym=num_sym;
01894 m_poim_num_feat=num_feat;
01895 }
01896
01897 void CWeightedDegreePositionStringKernel::compute_POIM2(
01898 int32_t max_degree, CSVM* svm)
01899 {
01900 ASSERT(svm);
01901 int32_t num_suppvec=svm->get_num_support_vectors();
01902 int32_t* sv_idx=new int32_t[num_suppvec];
01903 float64_t* sv_weight=new float64_t[num_suppvec];
01904
01905 for (int32_t i=0; i<num_suppvec; i++)
01906 {
01907 sv_idx[i]=svm->get_support_vector(i);
01908 sv_weight[i]=svm->get_alpha(i);
01909 }
01910
01911 if ((max_degree < 1) || (max_degree > 12))
01912 {
01913
01914 SG_WARNING( "max_degree out of range 1..12 (%d). setting to 1.\n", max_degree);
01915 max_degree=1;
01916 }
01917
01918 int32_t num_feat = m_poim_num_feat;
01919 int32_t num_sym = m_poim_num_sym;
01920 free(m_poim);
01921
01922 m_poim = compute_POIM(max_degree, num_feat, num_sym, NULL, num_suppvec, sv_idx,
01923 sv_weight, m_poim_distrib);
01924
01925 ASSERT(num_feat==1);
01926 m_poim_result_len=num_sym;
01927
01928 delete[] sv_weight;
01929 delete[] sv_idx;
01930 }
01931
01932 void CWeightedDegreePositionStringKernel::get_POIM2(
01933 float64_t** poim, int32_t* result_len)
01934 {
01935 *poim=(float64_t*) malloc(m_poim_result_len*sizeof(float64_t));
01936 ASSERT(*poim);
01937 memcpy(*poim, m_poim, m_poim_result_len*sizeof(float64_t)) ;
01938 *result_len=m_poim_result_len ;
01939 }
01940
01941 void CWeightedDegreePositionStringKernel::cleanup_POIM2()
01942 {
01943 free(m_poim) ;
01944 m_poim=NULL ;
01945 free(m_poim_distrib) ;
01946 m_poim_distrib=NULL ;
01947 m_poim_num_sym=0 ;
01948 m_poim_num_sym=0 ;
01949 m_poim_result_len=0 ;
01950 }
01951