WeightedDegreeStringKernel.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2009 Soeren Sonnenburg
00008  * Written (W) 1999-2008 Gunnar Raetsch
00009  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
00010  */
00011 
00012 #include "lib/common.h"
00013 #include "lib/io.h"
00014 #include "lib/Signal.h"
00015 #include "lib/Trie.h"
00016 #include "base/Parallel.h"
00017 
00018 #include "kernel/WeightedDegreeStringKernel.h"
00019 #include "kernel/FirstElementKernelNormalizer.h"
00020 #include "features/Features.h"
00021 #include "features/StringFeatures.h"
00022 
00023 #ifndef WIN32
00024 #include <pthread.h>
00025 #endif
00026 
00027 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00028 struct S_THREAD_PARAM
00029 {
00030     int32_t* vec;
00031     float64_t* result;
00032     float64_t* weights;
00033     CWeightedDegreeStringKernel* kernel;
00034     CTrie<DNATrie>* tries;
00035     float64_t factor;
00036     int32_t j;
00037     int32_t start;
00038     int32_t end;
00039     int32_t length;
00040     int32_t* vec_idx;
00041 };
00042 #endif // DOXYGEN_SHOULD_SKIP_THIS
00043 
00044 CWeightedDegreeStringKernel::CWeightedDegreeStringKernel (
00045     int32_t degree_, EWDKernType type_)
00046 : CStringKernel<char>(10),weights(NULL),position_weights(NULL),
00047     weights_buffer(NULL), mkl_stepsize(1),degree(degree_), length(0),
00048     max_mismatch(0), seq_length(0), block_computation(true),
00049     num_block_weights_external(0), block_weights_external(NULL),
00050     block_weights(NULL), type(type_), which_degree(-1), tries(NULL),
00051     tree_initialized(false), alphabet(NULL)
00052 {
00053     properties |= KP_LINADD | KP_KERNCOMBINATION | KP_BATCHEVALUATION;
00054     lhs=NULL;
00055     rhs=NULL;
00056 
00057     if (type!=E_EXTERNAL)
00058         set_wd_weights_by_type(type);
00059 
00060     set_normalizer(new CFirstElementKernelNormalizer());
00061 }
00062 
00063 CWeightedDegreeStringKernel::CWeightedDegreeStringKernel (
00064     float64_t *weights_, int32_t degree_)
00065 : CStringKernel<char>(10), weights(NULL), position_weights(NULL),
00066     weights_buffer(NULL), mkl_stepsize(1), degree(degree_), length(0),
00067     max_mismatch(0), seq_length(0), block_computation(true),
00068     num_block_weights_external(0), block_weights_external(NULL),
00069     block_weights(NULL), type(E_EXTERNAL), which_degree(-1), tries(NULL),
00070     tree_initialized(false), alphabet(NULL)
00071 {
00072     properties |= KP_LINADD | KP_KERNCOMBINATION | KP_BATCHEVALUATION;
00073     lhs=NULL;
00074     rhs=NULL;
00075 
00076     weights=new float64_t[degree*(1+max_mismatch)];
00077     for (int32_t i=0; i<degree*(1+max_mismatch); i++)
00078         weights[i]=weights_[i];
00079     set_normalizer(new CFirstElementKernelNormalizer());
00080 }
00081 
00082 CWeightedDegreeStringKernel::CWeightedDegreeStringKernel(
00083     CStringFeatures<char>* l, CStringFeatures<char>* r, int32_t degree_)
00084 : CStringKernel<char>(10), weights(NULL), position_weights(NULL),
00085     weights_buffer(NULL), mkl_stepsize(1), degree(degree_), length(0),
00086     max_mismatch(0), seq_length(0), block_computation(true),
00087     num_block_weights_external(0), block_weights_external(NULL),
00088     block_weights(NULL), type(E_WD), which_degree(-1), tries(NULL),
00089     tree_initialized(false), alphabet(NULL)
00090 {
00091     properties |= KP_LINADD | KP_KERNCOMBINATION | KP_BATCHEVALUATION;
00092 
00093     set_wd_weights_by_type(type);
00094     set_normalizer(new CFirstElementKernelNormalizer());
00095     init(l, r);
00096 }
00097 
00098 CWeightedDegreeStringKernel::~CWeightedDegreeStringKernel()
00099 {
00100     cleanup();
00101 
00102     delete[] weights;
00103     weights=NULL;
00104 
00105     delete[] block_weights;
00106     block_weights=NULL;
00107 
00108     delete[] position_weights;
00109     position_weights=NULL;
00110 
00111     delete[] weights_buffer;
00112     weights_buffer=NULL;
00113 }
00114 
00115 
00116 void CWeightedDegreeStringKernel::remove_lhs()
00117 { 
00118     SG_DEBUG( "deleting CWeightedDegreeStringKernel optimization\n");
00119     delete_optimization();
00120 
00121     if (tries!=NULL)
00122         tries->destroy();
00123 
00124     CKernel::remove_lhs();
00125 }
00126 
00127 void CWeightedDegreeStringKernel::create_empty_tries()
00128 {
00129     seq_length=((CStringFeatures<char>*) lhs)->get_max_vector_length();
00130 
00131     if (tries!=NULL)
00132     {
00133         tries->destroy() ;
00134         tries->create(seq_length, max_mismatch==0) ;
00135     }
00136 }
00137   
00138 bool CWeightedDegreeStringKernel::init(CFeatures* l, CFeatures* r)
00139 {
00140     int32_t lhs_changed=(lhs!=l);
00141     int32_t rhs_changed=(rhs!=r);
00142 
00143     CStringKernel<char>::init(l,r);
00144 
00145     SG_DEBUG("lhs_changed: %i\n", lhs_changed);
00146     SG_DEBUG("rhs_changed: %i\n", rhs_changed);
00147 
00148     CStringFeatures<char>* sf_l=(CStringFeatures<char>*) l;
00149     CStringFeatures<char>* sf_r=(CStringFeatures<char>*) r;
00150 
00151     int32_t len=sf_l->get_max_vector_length();
00152     if (lhs_changed && !sf_l->have_same_length(len))
00153         SG_ERROR("All strings in WD kernel must have same length (lhs wrong)!\n");
00154 
00155     if (rhs_changed && !sf_r->have_same_length(len))
00156         SG_ERROR("All strings in WD kernel must have same length (rhs wrong)!\n");
00157 
00158     SG_UNREF(alphabet);
00159     alphabet=sf_l->get_alphabet();
00160     CAlphabet* ralphabet=sf_r->get_alphabet();
00161     
00162     if (!((alphabet->get_alphabet()==DNA) || (alphabet->get_alphabet()==RNA)))
00163         properties &= ((uint64_t) (-1)) ^ (KP_LINADD | KP_BATCHEVALUATION);
00164 
00165     ASSERT(ralphabet->get_alphabet()==alphabet->get_alphabet());
00166     SG_UNREF(ralphabet);
00167 
00168     if (tries!=NULL) {
00169         tries->delete_trees(max_mismatch==0);
00170         SG_UNREF(tries);
00171     }
00172     tries=new CTrie<DNATrie>(degree, max_mismatch==0);
00173     create_empty_tries();
00174 
00175     init_block_weights();
00176 
00177     return init_normalizer();
00178 }
00179 
00180 void CWeightedDegreeStringKernel::cleanup()
00181 {
00182     SG_DEBUG("deleting CWeightedDegreeStringKernel optimization\n");
00183     delete_optimization();
00184 
00185     delete[] block_weights;
00186     block_weights=NULL;
00187 
00188     if (tries!=NULL)
00189     {
00190         tries->destroy();
00191         SG_UNREF(tries);
00192         tries=NULL;
00193     }
00194 
00195     seq_length=0;
00196     tree_initialized = false;
00197 
00198     SG_UNREF(alphabet);
00199     alphabet=NULL;
00200 
00201     CKernel::cleanup();
00202 }
00203 
00204 bool CWeightedDegreeStringKernel::load_init(FILE* src)
00205 {
00206     return false;
00207 }
00208 
00209 bool CWeightedDegreeStringKernel::save_init(FILE* dest)
00210 {
00211     return false;
00212 }
00213   
00214 
00215 bool CWeightedDegreeStringKernel::init_optimization(int32_t count, int32_t* IDX, float64_t* alphas, int32_t tree_num)
00216 {
00217     if (tree_num<0)
00218         SG_DEBUG( "deleting CWeightedDegreeStringKernel optimization\n");
00219 
00220     delete_optimization();
00221 
00222     if (tree_num<0)
00223         SG_DEBUG( "initializing CWeightedDegreeStringKernel optimization\n") ;
00224 
00225     for (int32_t i=0; i<count; i++)
00226     {
00227         if (tree_num<0)
00228         {
00229             if ( (i % (count/10+1)) == 0)
00230                 SG_PROGRESS(i, 0, count);
00231             
00232             if (max_mismatch==0)
00233                 add_example_to_tree(IDX[i], alphas[i]) ;
00234             else
00235                 add_example_to_tree_mismatch(IDX[i], alphas[i]) ;
00236 
00237             //SG_DEBUG( "number of used trie nodes: %i\n", tries.get_num_used_nodes()) ;
00238         }
00239         else
00240         {
00241             if (max_mismatch==0)
00242                 add_example_to_single_tree(IDX[i], alphas[i], tree_num) ;
00243             else
00244                 add_example_to_single_tree_mismatch(IDX[i], alphas[i], tree_num) ;
00245         }
00246     }
00247     
00248     if (tree_num<0)
00249         SG_DONE();
00250 
00251     //tries.compact_nodes(NO_CHILD, 0, weights) ;
00252 
00253     set_is_initialized(true) ;
00254     return true ;
00255 }
00256 
00257 bool CWeightedDegreeStringKernel::delete_optimization()
00258 { 
00259     if (get_is_initialized())
00260     {
00261         if (tries!=NULL)
00262             tries->delete_trees(max_mismatch==0); 
00263         set_is_initialized(false);
00264         return true;
00265     }
00266     
00267     return false;
00268 }
00269 
00270 
00271 float64_t CWeightedDegreeStringKernel::compute_with_mismatch(
00272     char* avec, int32_t alen, char* bvec, int32_t blen)
00273 {
00274     float64_t sum = 0.0;
00275 
00276     for (int32_t i=0; i<alen; i++)
00277     {
00278         float64_t sumi = 0.0;
00279         int32_t mismatches=0;
00280 
00281         for (int32_t j=0; (i+j<alen) && (j<degree); j++)
00282         {
00283             if (avec[i+j]!=bvec[i+j])
00284             {
00285                 mismatches++ ;
00286                 if (mismatches>max_mismatch)
00287                     break ;
00288             } ;
00289             sumi += weights[j+degree*mismatches];
00290         }
00291         if (position_weights!=NULL)
00292             sum+=position_weights[i]*sumi ;
00293         else
00294             sum+=sumi ;
00295     }
00296     return sum ;
00297 }
00298 
00299 float64_t CWeightedDegreeStringKernel::compute_using_block(
00300     char* avec, int32_t alen, char* bvec, int32_t blen)
00301 {
00302     ASSERT(alen==blen);
00303 
00304     float64_t sum=0;
00305     int32_t match_len=-1;
00306 
00307     for (int32_t i=0; i<alen; i++)
00308     {
00309         if (avec[i]==bvec[i])
00310             match_len++;
00311         else
00312         {
00313             if (match_len>=0)
00314                 sum+=block_weights[match_len];
00315             match_len=-1;
00316         }
00317     }
00318 
00319     if (match_len>=0)
00320         sum+=block_weights[match_len];
00321 
00322     return sum;
00323 }
00324 
00325 float64_t CWeightedDegreeStringKernel::compute_without_mismatch(
00326     char* avec, int32_t alen, char* bvec, int32_t blen)
00327 {
00328     float64_t sum = 0.0;
00329 
00330     for (int32_t i=0; i<alen; i++)
00331     {
00332         float64_t sumi = 0.0;
00333 
00334         for (int32_t j=0; (i+j<alen) && (j<degree); j++)
00335         {
00336             if (avec[i+j]!=bvec[i+j])
00337                 break ;
00338             sumi += weights[j];
00339         }
00340         if (position_weights!=NULL)
00341             sum+=position_weights[i]*sumi ;
00342         else
00343             sum+=sumi ;
00344     }
00345     return sum ;
00346 }
00347 
00348 float64_t CWeightedDegreeStringKernel::compute_without_mismatch_matrix(
00349     char* avec, int32_t alen, char* bvec, int32_t blen)
00350 {
00351     float64_t sum = 0.0;
00352 
00353     for (int32_t i=0; i<alen; i++)
00354     {
00355         float64_t sumi=0.0;
00356         for (int32_t j=0; (i+j<alen) && (j<degree); j++)
00357         {
00358             if (avec[i+j]!=bvec[i+j])
00359                 break;
00360             sumi += weights[i*degree+j];
00361         }
00362         if (position_weights!=NULL)
00363             sum += position_weights[i]*sumi ;
00364         else
00365             sum += sumi ;
00366     }
00367 
00368     return sum ;
00369 }
00370 
00371 
00372 float64_t CWeightedDegreeStringKernel::compute(int32_t idx_a, int32_t idx_b)
00373 {
00374     int32_t alen, blen;
00375     char* avec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx_a, alen);
00376     char* bvec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx_b, blen);
00377     float64_t result=0;
00378 
00379     if (max_mismatch==0 && length==0 && block_computation)
00380         result=compute_using_block(avec, alen, bvec, blen);
00381     else
00382     {
00383         if (max_mismatch>0)
00384             result=compute_with_mismatch(avec, alen, bvec, blen);
00385         else if (length==0)
00386             result=compute_without_mismatch(avec, alen, bvec, blen);
00387         else
00388             result=compute_without_mismatch_matrix(avec, alen, bvec, blen);
00389     }
00390 
00391     return result;
00392 }
00393 
00394 
00395 void CWeightedDegreeStringKernel::add_example_to_tree(
00396     int32_t idx, float64_t alpha)
00397 {
00398     ASSERT(alphabet);
00399     ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00400 
00401     int32_t len=0;
00402     char* char_vec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx, len);
00403     ASSERT(max_mismatch==0);
00404     int32_t *vec=new int32_t[len];
00405 
00406     for (int32_t i=0; i<len; i++)
00407         vec[i]=alphabet->remap_to_bin(char_vec[i]);
00408     
00409     if (length == 0 || max_mismatch > 0)
00410     {
00411         for (int32_t i=0; i<len; i++)
00412         {
00413             float64_t alpha_pw=alpha;
00414             /*if (position_weights!=NULL)
00415               alpha_pw *= position_weights[i] ;*/
00416             if (alpha_pw==0.0)
00417                 continue;
00418             ASSERT(tries);
00419             tries->add_to_trie(i, 0, vec, normalizer->normalize_lhs(alpha_pw, idx), weights, (length!=0));
00420         }
00421     }
00422     else
00423     {
00424         for (int32_t i=0; i<len; i++)
00425         {
00426             float64_t alpha_pw=alpha;
00427             /*if (position_weights!=NULL) 
00428               alpha_pw = alpha*position_weights[i] ;*/
00429             if (alpha_pw==0.0)
00430                 continue ;
00431             ASSERT(tries);
00432             tries->add_to_trie(i, 0, vec, normalizer->normalize_lhs(alpha_pw, idx), weights, (length!=0));
00433         }
00434     }
00435     delete[] vec ;
00436     tree_initialized=true ;
00437 }
00438 
00439 void CWeightedDegreeStringKernel::add_example_to_single_tree(
00440     int32_t idx, float64_t alpha, int32_t tree_num) 
00441 {
00442     ASSERT(alphabet);
00443     ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00444 
00445     int32_t len ;
00446     char* char_vec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx, len);
00447     ASSERT(max_mismatch==0);
00448     int32_t *vec = new int32_t[len] ;
00449 
00450     for (int32_t i=tree_num; i<tree_num+degree && i<len; i++)
00451         vec[i]=alphabet->remap_to_bin(char_vec[i]);
00452     
00453     ASSERT(tries);
00454     if (alpha!=0.0)
00455         tries->add_to_trie(tree_num, 0, vec, normalizer->normalize_lhs(alpha, idx), weights, (length!=0));
00456 
00457     delete[] vec ;
00458     tree_initialized=true ;
00459 }
00460 
00461 void CWeightedDegreeStringKernel::add_example_to_tree_mismatch(int32_t idx, float64_t alpha)
00462 {
00463     ASSERT(tries);
00464     ASSERT(alphabet);
00465     ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00466 
00467     int32_t len ;
00468     char* char_vec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx, len);
00469     
00470     int32_t *vec = new int32_t[len] ;
00471     
00472     for (int32_t i=0; i<len; i++)
00473         vec[i]=alphabet->remap_to_bin(char_vec[i]);
00474     
00475     for (int32_t i=0; i<len; i++)
00476     {
00477         if (alpha!=0.0)
00478             tries->add_example_to_tree_mismatch_recursion(NO_CHILD, i, normalizer->normalize_lhs(alpha, idx), &vec[i], len-i, 0, 0, max_mismatch, weights);
00479     }
00480     
00481     delete[] vec ;
00482     tree_initialized=true ;
00483 }
00484 
00485 void CWeightedDegreeStringKernel::add_example_to_single_tree_mismatch(
00486     int32_t idx, float64_t alpha, int32_t tree_num)
00487 {
00488     ASSERT(tries);
00489     ASSERT(alphabet);
00490     ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00491 
00492     int32_t len=0;
00493     char* char_vec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx, len);
00494     int32_t *vec=new int32_t[len];
00495 
00496     for (int32_t i=tree_num; i<len && i<tree_num+degree; i++)
00497         vec[i]=alphabet->remap_to_bin(char_vec[i]);
00498 
00499     if (alpha!=0.0)
00500     {
00501         tries->add_example_to_tree_mismatch_recursion(
00502             NO_CHILD, tree_num, normalizer->normalize_lhs(alpha, idx), &vec[tree_num], len-tree_num,
00503             0, 0, max_mismatch, weights);
00504     }
00505 
00506     delete[] vec;
00507     tree_initialized=true;
00508 }
00509 
00510 
00511 float64_t CWeightedDegreeStringKernel::compute_by_tree(int32_t idx)
00512 {
00513     ASSERT(alphabet);
00514     ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00515 
00516     int32_t len=0;
00517     char* char_vec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx, len);
00518     ASSERT(char_vec && len>0);
00519     int32_t *vec=new int32_t[len];
00520 
00521     for (int32_t i=0; i<len; i++)
00522         vec[i]=alphabet->remap_to_bin(char_vec[i]);
00523 
00524     float64_t sum=0;
00525     ASSERT(tries);
00526     for (int32_t i=0; i<len; i++)
00527         sum+=tries->compute_by_tree_helper(vec, len, i, i, i, weights, (length!=0));
00528 
00529     delete[] vec;
00530     return normalizer->normalize_rhs(sum, idx);
00531 }
00532 
00533 void CWeightedDegreeStringKernel::compute_by_tree(
00534     int32_t idx, float64_t* LevelContrib)
00535 {
00536     ASSERT(alphabet);
00537     ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00538 
00539     int32_t len ;
00540     char* char_vec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx, len);
00541 
00542     int32_t *vec = new int32_t[len] ;
00543 
00544     for (int32_t i=0; i<len; i++)
00545         vec[i]=alphabet->remap_to_bin(char_vec[i]);
00546 
00547     ASSERT(tries);
00548     for (int32_t i=0; i<len; i++)
00549     {
00550         tries->compute_by_tree_helper(vec, len, i, i, i, LevelContrib,
00551                 normalizer->normalize_rhs(1.0, idx),
00552                 mkl_stepsize, weights, (length!=0));
00553     }
00554 
00555     delete[] vec ;
00556 }
00557 
00558 float64_t *CWeightedDegreeStringKernel::compute_abs_weights(int32_t &len)
00559 {
00560     ASSERT(tries);
00561     return tries->compute_abs_weights(len);
00562 }
00563 
00564 bool CWeightedDegreeStringKernel::set_wd_weights_by_type(EWDKernType p_type)
00565 {
00566     ASSERT(degree>0);
00567     ASSERT(p_type==E_WD); 
00568 
00569     delete[] weights;
00570     weights=new float64_t[degree];
00571     if (weights)
00572     {
00573         int32_t i;
00574         float64_t sum=0;
00575         for (i=0; i<degree; i++)
00576         {
00577             weights[i]=degree-i;
00578             sum+=weights[i];
00579         }
00580         for (i=0; i<degree; i++)
00581             weights[i]/=sum;
00582 
00583         for (i=0; i<degree; i++)
00584         {
00585             for (int32_t j=1; j<=max_mismatch; j++)
00586             {
00587                 if (j<i+1)
00588                 {
00589                     int32_t nk=CMath::nchoosek(i+1, j);
00590                     weights[i+j*degree]=weights[i]/(nk*CMath::pow(3.0,j));
00591                 }
00592                 else
00593                     weights[i+j*degree]= 0;
00594             }
00595         }
00596 
00597         if (which_degree>=0)
00598         {
00599             ASSERT(which_degree<degree);
00600             for (i=0; i<degree; i++)
00601             {
00602                 if (i!=which_degree)
00603                     weights[i]=0;
00604                 else
00605                     weights[i]=1;
00606             }
00607         }
00608         return true;
00609     }
00610     else
00611         return false;
00612 }
00613 
00614 bool CWeightedDegreeStringKernel::set_weights(
00615     float64_t* ws, int32_t d, int32_t len)
00616 {
00617     SG_DEBUG("degree = %i  d=%i\n", degree, d);
00618     degree=d;
00619     ASSERT(tries);
00620     tries->set_degree(degree);
00621     length=len;
00622     
00623     if (length==0) length=1;
00624     int32_t num_weights=degree*(length+max_mismatch);
00625     delete[] weights;
00626     weights=new float64_t[num_weights];
00627     if (weights)
00628     {
00629         for (int32_t i=0; i<num_weights; i++) {
00630             if (ws[i]) // len(ws) might be != num_weights?
00631                 weights[i]=ws[i];
00632         }
00633         return true;
00634     }
00635     else
00636         return false;
00637 }
00638 
00639 bool CWeightedDegreeStringKernel::set_position_weights(
00640     float64_t* pws, int32_t len)
00641 {
00642     if (len==0)
00643     {
00644         delete[] position_weights;
00645         position_weights=NULL;
00646         ASSERT(tries);
00647         tries->set_position_weights(position_weights);
00648     }
00649 
00650     if (seq_length!=len)
00651         SG_ERROR("seq_length = %i, position_weights_length=%i\n", seq_length, len);
00652 
00653     delete[] position_weights;
00654     position_weights=new float64_t[len];
00655     ASSERT(tries);
00656     tries->set_position_weights(position_weights);
00657     
00658     if (position_weights)
00659     {
00660         for (int32_t i=0; i<len; i++)
00661             position_weights[i]=pws[i];
00662         return true;
00663     }
00664     else
00665         return false;
00666 }
00667 
00668 bool CWeightedDegreeStringKernel::init_block_weights_from_wd()
00669 {
00670     delete[] block_weights;
00671     block_weights=new float64_t[CMath::max(seq_length,degree)];
00672 
00673     if (block_weights)
00674     {
00675         int32_t k;
00676         float64_t d=degree; // use float to evade rounding errors below
00677 
00678         for (k=0; k<degree; k++)
00679             block_weights[k]=
00680                 (-CMath::pow(k, 3)+(3*d-3)*CMath::pow(k, 2)+(9*d-2)*k+6*d)/(3*d*(d+1));
00681         for (k=degree; k<seq_length; k++)
00682             block_weights[k]=(-d+3*k+4)/3;
00683     }
00684 
00685     return (block_weights!=NULL);
00686 }
00687 
00688 bool CWeightedDegreeStringKernel::init_block_weights_from_wd_external()
00689 {
00690     ASSERT(weights);
00691     delete[] block_weights;
00692     block_weights=new float64_t[CMath::max(seq_length,degree)];
00693 
00694     if (block_weights)
00695     {
00696         int32_t i=0;
00697         block_weights[0]=weights[0];
00698         for (i=1; i<CMath::max(seq_length,degree); i++)
00699             block_weights[i]=0;
00700 
00701         for (i=1; i<CMath::max(seq_length,degree); i++)
00702         {
00703             block_weights[i]=block_weights[i-1];
00704 
00705             float64_t contrib=0;
00706             for (int32_t j=0; j<CMath::min(degree,i+1); j++)
00707                 contrib+=weights[j];
00708 
00709             block_weights[i]+=contrib;
00710         }
00711     }
00712 
00713     return (block_weights!=NULL);
00714 }
00715 
00716 bool CWeightedDegreeStringKernel::init_block_weights_const()
00717 {
00718     delete[] block_weights;
00719     block_weights=new float64_t[seq_length];
00720 
00721     if (block_weights)
00722     {
00723         for (int32_t i=1; i<seq_length+1 ; i++)
00724             block_weights[i-1]=1.0/seq_length;
00725     }
00726 
00727     return (block_weights!=NULL);
00728 }
00729 
00730 bool CWeightedDegreeStringKernel::init_block_weights_linear()
00731 {
00732     delete[] block_weights;
00733     block_weights=new float64_t[seq_length];
00734 
00735     if (block_weights)
00736     {
00737         for (int32_t i=1; i<seq_length+1 ; i++)
00738             block_weights[i-1]=degree*i;
00739     }
00740 
00741     return (block_weights!=NULL);
00742 }
00743 
00744 bool CWeightedDegreeStringKernel::init_block_weights_sqpoly()
00745 {
00746     delete[] block_weights;
00747     block_weights=new float64_t[seq_length];
00748 
00749     if (block_weights)
00750     {
00751         for (int32_t i=1; i<degree+1 ; i++)
00752             block_weights[i-1]=((float64_t) i)*i;
00753 
00754         for (int32_t i=degree+1; i<seq_length+1 ; i++)
00755             block_weights[i-1]=i;
00756     }
00757 
00758     return (block_weights!=NULL);
00759 }
00760 
00761 bool CWeightedDegreeStringKernel::init_block_weights_cubicpoly()
00762 {
00763     delete[] block_weights;
00764     block_weights=new float64_t[seq_length];
00765 
00766     if (block_weights)
00767     {
00768         for (int32_t i=1; i<degree+1 ; i++)
00769             block_weights[i-1]=((float64_t) i)*i*i;
00770 
00771         for (int32_t i=degree+1; i<seq_length+1 ; i++)
00772             block_weights[i-1]=i;
00773     }
00774 
00775     return (block_weights!=NULL);
00776 }
00777 
00778 bool CWeightedDegreeStringKernel::init_block_weights_exp()
00779 {
00780     delete[] block_weights;
00781     block_weights=new float64_t[seq_length];
00782 
00783     if (block_weights)
00784     {
00785         for (int32_t i=1; i<degree+1 ; i++)
00786             block_weights[i-1]=exp(((float64_t) i/10.0));
00787 
00788         for (int32_t i=degree+1; i<seq_length+1 ; i++)
00789             block_weights[i-1]=i;
00790     }
00791 
00792     return (block_weights!=NULL);
00793 }
00794 
00795 bool CWeightedDegreeStringKernel::init_block_weights_log()
00796 {
00797     delete[] block_weights;
00798     block_weights=new float64_t[seq_length];
00799 
00800     if (block_weights)
00801     {
00802         for (int32_t i=1; i<degree+1 ; i++)
00803             block_weights[i-1]=CMath::pow(CMath::log((float64_t) i),2);
00804 
00805         for (int32_t i=degree+1; i<seq_length+1 ; i++)
00806             block_weights[i-1]=i-degree+1+CMath::pow(CMath::log(degree+1.0),2);
00807     }
00808 
00809     return (block_weights!=NULL);
00810 }
00811 
00812 bool CWeightedDegreeStringKernel::init_block_weights_external()
00813 {
00814     if (block_weights_external && (seq_length == num_block_weights_external) )
00815     {
00816         delete[] block_weights;
00817         block_weights=new float64_t[seq_length];
00818 
00819         if (block_weights)
00820         {
00821             for (int32_t i=0; i<seq_length; i++)
00822                 block_weights[i]=block_weights_external[i];
00823         }
00824     }
00825     else {
00826       SG_ERROR( "sequence longer then weights (seqlen:%d, wlen:%d)\n", seq_length, block_weights_external);
00827    }
00828     return (block_weights!=NULL);
00829 }
00830 
00831 bool CWeightedDegreeStringKernel::init_block_weights()
00832 {
00833     switch (type)
00834     {
00835         case E_WD:
00836             return init_block_weights_from_wd();
00837         case E_EXTERNAL:
00838             return init_block_weights_from_wd_external();
00839         case E_BLOCK_CONST:
00840             return init_block_weights_const();
00841         case E_BLOCK_LINEAR:
00842             return init_block_weights_linear();
00843         case E_BLOCK_SQPOLY:
00844             return init_block_weights_sqpoly();
00845         case E_BLOCK_CUBICPOLY:
00846             return init_block_weights_cubicpoly();
00847         case E_BLOCK_EXP:
00848             return init_block_weights_exp();
00849         case E_BLOCK_LOG:
00850             return init_block_weights_log();
00851         case E_BLOCK_EXTERNAL:
00852             return init_block_weights_external();
00853         default:
00854             return false;
00855     };
00856 }
00857 
00858 
00859 void* CWeightedDegreeStringKernel::compute_batch_helper(void* p)
00860 {
00861     S_THREAD_PARAM* params = (S_THREAD_PARAM*) p;
00862     int32_t j=params->j;
00863     CWeightedDegreeStringKernel* wd=params->kernel;
00864     CTrie<DNATrie>* tries=params->tries;
00865     float64_t* weights=params->weights;
00866     int32_t length=params->length;
00867     int32_t* vec=params->vec;
00868     float64_t* result=params->result;
00869     float64_t factor=params->factor;
00870     int32_t* vec_idx=params->vec_idx;
00871 
00872     CStringFeatures<char>* rhs_feat=((CStringFeatures<char>*) wd->get_rhs());
00873     CAlphabet* alpha=wd->alphabet;
00874 
00875     for (int32_t i=params->start; i<params->end; i++)
00876     {
00877         int32_t len=0;
00878         char* char_vec=rhs_feat->get_feature_vector(vec_idx[i], len);
00879         for (int32_t k=j; k<CMath::min(len,j+wd->get_degree()); k++)
00880             vec[k]=alpha->remap_to_bin(char_vec[k]);
00881 
00882         ASSERT(tries);
00883 
00884         result[i]+=factor*
00885             wd->normalizer->normalize_rhs(tries->compute_by_tree_helper(vec, len, j, j, j, weights, (length!=0)), vec_idx[i]);
00886     }
00887 
00888     SG_UNREF(rhs_feat);
00889 
00890     return NULL;
00891 }
00892 
00893 void CWeightedDegreeStringKernel::compute_batch(
00894     int32_t num_vec, int32_t* vec_idx, float64_t* result, int32_t num_suppvec,
00895     int32_t* IDX, float64_t* alphas, float64_t factor)
00896 {
00897     ASSERT(tries);
00898     ASSERT(alphabet);
00899     ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00900     ASSERT(rhs);
00901     ASSERT(num_vec<=rhs->get_num_vectors());
00902     ASSERT(num_vec>0);
00903     ASSERT(vec_idx);
00904     ASSERT(result);
00905     create_empty_tries();
00906 
00907     int32_t num_feat=((CStringFeatures<char>*) rhs)->get_max_vector_length();
00908     ASSERT(num_feat>0);
00909     int32_t num_threads=parallel->get_num_threads();
00910     ASSERT(num_threads>0);
00911     int32_t* vec=new int32_t[num_threads*num_feat];
00912 
00913     if (num_threads < 2)
00914     {
00915 #ifdef CYGWIN
00916         for (int32_t j=0; j<num_feat; j++)
00917 #else
00918         CSignal::clear_cancel();
00919         for (int32_t j=0; j<num_feat && !CSignal::cancel_computations(); j++)
00920 #endif
00921         {
00922             init_optimization(num_suppvec, IDX, alphas, j);
00923             S_THREAD_PARAM params;
00924             params.vec=vec;
00925             params.result=result;
00926             params.weights=weights;
00927             params.kernel=this;
00928             params.tries=tries;
00929             params.factor=factor;
00930             params.j=j;
00931             params.start=0;
00932             params.end=num_vec;
00933             params.length=length;
00934             params.vec_idx=vec_idx;
00935             compute_batch_helper((void*) &params);
00936 
00937             SG_PROGRESS(j,0,num_feat);
00938         }
00939     }
00940 #ifndef WIN32
00941     else
00942     {
00943         CSignal::clear_cancel();
00944         for (int32_t j=0; j<num_feat && !CSignal::cancel_computations(); j++)
00945         {
00946             init_optimization(num_suppvec, IDX, alphas, j);
00947             pthread_t* threads = new pthread_t[num_threads-1];
00948             S_THREAD_PARAM* params = new S_THREAD_PARAM[num_threads];
00949             int32_t step= num_vec/num_threads;
00950             int32_t t;
00951 
00952             for (t=0; t<num_threads-1; t++)
00953             {
00954                 params[t].vec=&vec[num_feat*t];
00955                 params[t].result=result;
00956                 params[t].weights=weights;
00957                 params[t].kernel=this;
00958                 params[t].tries=tries;
00959                 params[t].factor=factor;
00960                 params[t].j=j;
00961                 params[t].start = t*step;
00962                 params[t].end = (t+1)*step;
00963                 params[t].length=length;
00964                 params[t].vec_idx=vec_idx;
00965                 pthread_create(&threads[t], NULL, CWeightedDegreeStringKernel::compute_batch_helper, (void*)&params[t]);
00966             }
00967             params[t].vec=&vec[num_feat*t];
00968             params[t].result=result;
00969             params[t].weights=weights;
00970             params[t].kernel=this;
00971             params[t].tries=tries;
00972             params[t].factor=factor;
00973             params[t].j=j;
00974             params[t].start=t*step;
00975             params[t].end=num_vec;
00976             params[t].length=length;
00977             params[t].vec_idx=vec_idx;
00978             compute_batch_helper((void*) &params[t]);
00979 
00980             for (t=0; t<num_threads-1; t++)
00981                 pthread_join(threads[t], NULL);
00982             SG_PROGRESS(j,0,num_feat);
00983 
00984             delete[] params;
00985             delete[] threads;
00986         }
00987     }
00988 #endif
00989 
00990     delete[] vec;
00991 
00992     //really also free memory as this can be huge on testing especially when
00993     //using the combined kernel
00994     create_empty_tries();
00995 }
00996 
00997 bool CWeightedDegreeStringKernel::set_max_mismatch(int32_t max)
00998 {
00999     if (type==E_EXTERNAL && max!=0) {
01000         return false;
01001     }
01002 
01003     max_mismatch=max;
01004 
01005     if (lhs!=NULL && rhs!=NULL)
01006         return init(lhs, rhs);
01007     else
01008         return true;
01009 }

SHOGUN Machine Learning Toolbox - Documentation