WeightedDegreeStringKernel.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2009 Soeren Sonnenburg
00008  * Written (W) 1999-2008 Gunnar Raetsch
00009  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
00010  */
00011 
00012 #include "lib/common.h"
00013 #include "lib/io.h"
00014 #include "lib/Signal.h"
00015 #include "lib/Trie.h"
00016 #include "base/Parallel.h"
00017 
00018 #include "kernel/WeightedDegreeStringKernel.h"
00019 #include "kernel/FirstElementKernelNormalizer.h"
00020 #include "features/Features.h"
00021 #include "features/StringFeatures.h"
00022 
00023 #ifndef WIN32
00024 #include <pthread.h>
00025 #endif
00026 
00027 
00028 #ifdef HAVE_BOOST_SERIALIZATION
00029 #include <boost/serialization/export.hpp>
00030 BOOST_CLASS_EXPORT(shogun::CWeightedDegreeStringKernel);
00031 #endif //HAVE_BOOST_SERIALIZATION
00032 
00033 
00034 using namespace shogun;
00035 
00036 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00037 struct S_THREAD_PARAM
00038 {
00039 
00040     int32_t* vec;
00041     float64_t* result;
00042     float64_t* weights;
00043     CWeightedDegreeStringKernel* kernel;
00044     CTrie<DNATrie>* tries;
00045     float64_t factor;
00046     int32_t j;
00047     int32_t start;
00048     int32_t end;
00049     int32_t length;
00050     int32_t* vec_idx;
00051 };
00052 #endif // DOXYGEN_SHOULD_SKIP_THIS
00053 
00054 
00055 CWeightedDegreeStringKernel::CWeightedDegreeStringKernel (
00056     int32_t degree_, EWDKernType type_)
00057 : CStringKernel<char>(10),weights(NULL),position_weights(NULL),
00058     weights_buffer(NULL), mkl_stepsize(1),degree(degree_), length(0),
00059     max_mismatch(0), seq_length(0), block_computation(true),
00060     num_block_weights_external(0), block_weights_external(NULL),
00061     block_weights(NULL), type(type_), which_degree(-1), tries(NULL),
00062     tree_initialized(false), alphabet(NULL)
00063 {
00064     properties |= KP_LINADD | KP_KERNCOMBINATION | KP_BATCHEVALUATION;
00065     lhs=NULL;
00066     rhs=NULL;
00067 
00068     if (type!=E_EXTERNAL)
00069         set_wd_weights_by_type(type);
00070 
00071     set_normalizer(new CFirstElementKernelNormalizer());
00072 }
00073 
00074 CWeightedDegreeStringKernel::CWeightedDegreeStringKernel (
00075     float64_t *weights_, int32_t degree_)
00076 : CStringKernel<char>(10), weights(NULL), position_weights(NULL),
00077     weights_buffer(NULL), mkl_stepsize(1), degree(degree_), length(0),
00078     max_mismatch(0), seq_length(0), block_computation(true),
00079     num_block_weights_external(0), block_weights_external(NULL),
00080     block_weights(NULL), type(E_EXTERNAL), which_degree(-1), tries(NULL),
00081     tree_initialized(false), alphabet(NULL)
00082 {
00083     properties |= KP_LINADD | KP_KERNCOMBINATION | KP_BATCHEVALUATION;
00084     lhs=NULL;
00085     rhs=NULL;
00086 
00087     weights=new float64_t[degree*(1+max_mismatch)];
00088     for (int32_t i=0; i<degree*(1+max_mismatch); i++)
00089         weights[i]=weights_[i];
00090     set_normalizer(new CFirstElementKernelNormalizer());
00091 }
00092 
00093 CWeightedDegreeStringKernel::CWeightedDegreeStringKernel(
00094     CStringFeatures<char>* l, CStringFeatures<char>* r, int32_t degree_)
00095 : CStringKernel<char>(10), weights(NULL), position_weights(NULL),
00096     weights_buffer(NULL), mkl_stepsize(1), degree(degree_), length(0),
00097     max_mismatch(0), seq_length(0), block_computation(true),
00098     num_block_weights_external(0), block_weights_external(NULL),
00099     block_weights(NULL), type(E_WD), which_degree(-1), tries(NULL),
00100     tree_initialized(false), alphabet(NULL)
00101 {
00102     properties |= KP_LINADD | KP_KERNCOMBINATION | KP_BATCHEVALUATION;
00103 
00104     set_wd_weights_by_type(type);
00105     set_normalizer(new CFirstElementKernelNormalizer());
00106     init(l, r);
00107 }
00108 
00109 CWeightedDegreeStringKernel::~CWeightedDegreeStringKernel()
00110 {
00111     cleanup();
00112 
00113     delete[] weights;
00114     weights=NULL;
00115 
00116     delete[] block_weights;
00117     block_weights=NULL;
00118 
00119     delete[] position_weights;
00120     position_weights=NULL;
00121 
00122     delete[] weights_buffer;
00123     weights_buffer=NULL;
00124 }
00125 
00126 
00127 void CWeightedDegreeStringKernel::remove_lhs()
00128 {
00129     SG_DEBUG( "deleting CWeightedDegreeStringKernel optimization\n");
00130     delete_optimization();
00131 
00132     if (tries!=NULL)
00133         tries->destroy();
00134 
00135     CKernel::remove_lhs();
00136 }
00137 
00138 void CWeightedDegreeStringKernel::create_empty_tries()
00139 {
00140     ASSERT(lhs);
00141 
00142     seq_length=((CStringFeatures<char>*) lhs)->get_max_vector_length();
00143 
00144     if (tries!=NULL)
00145     {
00146         tries->destroy() ;
00147         tries->create(seq_length, max_mismatch==0) ;
00148     }
00149 }
00150 
00151 bool CWeightedDegreeStringKernel::init(CFeatures* l, CFeatures* r)
00152 {
00153     int32_t lhs_changed=(lhs!=l);
00154     int32_t rhs_changed=(rhs!=r);
00155 
00156     CStringKernel<char>::init(l,r);
00157 
00158     SG_DEBUG("lhs_changed: %i\n", lhs_changed);
00159     SG_DEBUG("rhs_changed: %i\n", rhs_changed);
00160 
00161     CStringFeatures<char>* sf_l=(CStringFeatures<char>*) l;
00162     CStringFeatures<char>* sf_r=(CStringFeatures<char>*) r;
00163 
00164     int32_t len=sf_l->get_max_vector_length();
00165     if (lhs_changed && !sf_l->have_same_length(len))
00166         SG_ERROR("All strings in WD kernel must have same length (lhs wrong)!\n");
00167 
00168     if (rhs_changed && !sf_r->have_same_length(len))
00169         SG_ERROR("All strings in WD kernel must have same length (rhs wrong)!\n");
00170 
00171     SG_UNREF(alphabet);
00172     alphabet=sf_l->get_alphabet();
00173     CAlphabet* ralphabet=sf_r->get_alphabet();
00174 
00175     if (!((alphabet->get_alphabet()==DNA) || (alphabet->get_alphabet()==RNA)))
00176         properties &= ((uint64_t) (-1)) ^ (KP_LINADD | KP_BATCHEVALUATION);
00177 
00178     ASSERT(ralphabet->get_alphabet()==alphabet->get_alphabet());
00179     SG_UNREF(ralphabet);
00180 
00181     if (tries!=NULL) {
00182         tries->delete_trees(max_mismatch==0);
00183         SG_UNREF(tries);
00184     }
00185     tries=new CTrie<DNATrie>(degree, max_mismatch==0);
00186     create_empty_tries();
00187 
00188     init_block_weights();
00189 
00190     return init_normalizer();
00191 }
00192 
00193 void CWeightedDegreeStringKernel::cleanup()
00194 {
00195     SG_DEBUG("deleting CWeightedDegreeStringKernel optimization\n");
00196     delete_optimization();
00197 
00198     delete[] block_weights;
00199     block_weights=NULL;
00200 
00201     if (tries!=NULL)
00202     {
00203         tries->destroy();
00204         SG_UNREF(tries);
00205         tries=NULL;
00206     }
00207 
00208     seq_length=0;
00209     tree_initialized = false;
00210 
00211     SG_UNREF(alphabet);
00212     alphabet=NULL;
00213 
00214     CKernel::cleanup();
00215 }
00216 
00217 bool CWeightedDegreeStringKernel::init_optimization(int32_t count, int32_t* IDX, float64_t* alphas, int32_t tree_num)
00218 {
00219     if (tree_num<0)
00220         SG_DEBUG( "deleting CWeightedDegreeStringKernel optimization\n");
00221 
00222     delete_optimization();
00223 
00224     if (tree_num<0)
00225         SG_DEBUG( "initializing CWeightedDegreeStringKernel optimization\n") ;
00226 
00227     for (int32_t i=0; i<count; i++)
00228     {
00229         if (tree_num<0)
00230         {
00231             if ( (i % (count/10+1)) == 0)
00232                 SG_PROGRESS(i, 0, count);
00233 
00234             if (max_mismatch==0)
00235                 add_example_to_tree(IDX[i], alphas[i]) ;
00236             else
00237                 add_example_to_tree_mismatch(IDX[i], alphas[i]) ;
00238 
00239             //SG_DEBUG( "number of used trie nodes: %i\n", tries.get_num_used_nodes()) ;
00240         }
00241         else
00242         {
00243             if (max_mismatch==0)
00244                 add_example_to_single_tree(IDX[i], alphas[i], tree_num) ;
00245             else
00246                 add_example_to_single_tree_mismatch(IDX[i], alphas[i], tree_num) ;
00247         }
00248     }
00249 
00250     if (tree_num<0)
00251         SG_DONE();
00252 
00253     //tries.compact_nodes(NO_CHILD, 0, weights) ;
00254 
00255     set_is_initialized(true) ;
00256     return true ;
00257 }
00258 
00259 bool CWeightedDegreeStringKernel::delete_optimization()
00260 {
00261     if (get_is_initialized())
00262     {
00263         if (tries!=NULL)
00264             tries->delete_trees(max_mismatch==0);
00265         set_is_initialized(false);
00266         return true;
00267     }
00268 
00269     return false;
00270 }
00271 
00272 
00273 float64_t CWeightedDegreeStringKernel::compute_with_mismatch(
00274     char* avec, int32_t alen, char* bvec, int32_t blen)
00275 {
00276     float64_t sum = 0.0;
00277 
00278     for (int32_t i=0; i<alen; i++)
00279     {
00280         float64_t sumi = 0.0;
00281         int32_t mismatches=0;
00282 
00283         for (int32_t j=0; (i+j<alen) && (j<degree); j++)
00284         {
00285             if (avec[i+j]!=bvec[i+j])
00286             {
00287                 mismatches++ ;
00288                 if (mismatches>max_mismatch)
00289                     break ;
00290             } ;
00291             sumi += weights[j+degree*mismatches];
00292         }
00293         if (position_weights!=NULL)
00294             sum+=position_weights[i]*sumi ;
00295         else
00296             sum+=sumi ;
00297     }
00298     return sum ;
00299 }
00300 
00301 float64_t CWeightedDegreeStringKernel::compute_using_block(
00302     char* avec, int32_t alen, char* bvec, int32_t blen)
00303 {
00304     ASSERT(alen==blen);
00305 
00306     float64_t sum=0;
00307     int32_t match_len=-1;
00308 
00309     for (int32_t i=0; i<alen; i++)
00310     {
00311         if (avec[i]==bvec[i])
00312             match_len++;
00313         else
00314         {
00315             if (match_len>=0)
00316                 sum+=block_weights[match_len];
00317             match_len=-1;
00318         }
00319     }
00320 
00321     if (match_len>=0)
00322         sum+=block_weights[match_len];
00323 
00324     return sum;
00325 }
00326 
00327 float64_t CWeightedDegreeStringKernel::compute_without_mismatch(
00328     char* avec, int32_t alen, char* bvec, int32_t blen)
00329 {
00330     float64_t sum = 0.0;
00331 
00332     for (int32_t i=0; i<alen; i++)
00333     {
00334         float64_t sumi = 0.0;
00335 
00336         for (int32_t j=0; (i+j<alen) && (j<degree); j++)
00337         {
00338             if (avec[i+j]!=bvec[i+j])
00339                 break ;
00340             sumi += weights[j];
00341         }
00342         if (position_weights!=NULL)
00343             sum+=position_weights[i]*sumi ;
00344         else
00345             sum+=sumi ;
00346     }
00347     return sum ;
00348 }
00349 
00350 float64_t CWeightedDegreeStringKernel::compute_without_mismatch_matrix(
00351     char* avec, int32_t alen, char* bvec, int32_t blen)
00352 {
00353     float64_t sum = 0.0;
00354 
00355     for (int32_t i=0; i<alen; i++)
00356     {
00357         float64_t sumi=0.0;
00358         for (int32_t j=0; (i+j<alen) && (j<degree); j++)
00359         {
00360             if (avec[i+j]!=bvec[i+j])
00361                 break;
00362             sumi += weights[i*degree+j];
00363         }
00364         if (position_weights!=NULL)
00365             sum += position_weights[i]*sumi ;
00366         else
00367             sum += sumi ;
00368     }
00369 
00370     return sum ;
00371 }
00372 
00373 
00374 float64_t CWeightedDegreeStringKernel::compute(int32_t idx_a, int32_t idx_b)
00375 {
00376     int32_t alen, blen;
00377     bool free_avec, free_bvec;
00378     char* avec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx_a, alen, free_avec);
00379     char* bvec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx_b, blen, free_bvec);
00380     float64_t result=0;
00381 
00382     if (max_mismatch==0 && length==0 && block_computation)
00383         result=compute_using_block(avec, alen, bvec, blen);
00384     else
00385     {
00386         if (max_mismatch>0)
00387             result=compute_with_mismatch(avec, alen, bvec, blen);
00388         else if (length==0)
00389             result=compute_without_mismatch(avec, alen, bvec, blen);
00390         else
00391             result=compute_without_mismatch_matrix(avec, alen, bvec, blen);
00392     }
00393     ((CStringFeatures<char>*) lhs)->free_feature_vector(avec, idx_a, free_avec);
00394     ((CStringFeatures<char>*) rhs)->free_feature_vector(bvec, idx_b, free_bvec);
00395 
00396     return result;
00397 }
00398 
00399 
00400 void CWeightedDegreeStringKernel::add_example_to_tree(
00401     int32_t idx, float64_t alpha)
00402 {
00403     ASSERT(alphabet);
00404     ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00405 
00406     int32_t len=0;
00407     bool free_vec;
00408     char* char_vec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx, len, free_vec);
00409     ASSERT(max_mismatch==0);
00410     int32_t *vec=new int32_t[len];
00411 
00412     for (int32_t i=0; i<len; i++)
00413         vec[i]=alphabet->remap_to_bin(char_vec[i]);
00414     ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec);
00415 
00416     if (length == 0 || max_mismatch > 0)
00417     {
00418         for (int32_t i=0; i<len; i++)
00419         {
00420             float64_t alpha_pw=alpha;
00421             /*if (position_weights!=NULL)
00422               alpha_pw *= position_weights[i] ;*/
00423             if (alpha_pw==0.0)
00424                 continue;
00425             ASSERT(tries);
00426             tries->add_to_trie(i, 0, vec, normalizer->normalize_lhs(alpha_pw, idx), weights, (length!=0));
00427         }
00428     }
00429     else
00430     {
00431         for (int32_t i=0; i<len; i++)
00432         {
00433             float64_t alpha_pw=alpha;
00434             /*if (position_weights!=NULL)
00435               alpha_pw = alpha*position_weights[i] ;*/
00436             if (alpha_pw==0.0)
00437                 continue ;
00438             ASSERT(tries);
00439             tries->add_to_trie(i, 0, vec, normalizer->normalize_lhs(alpha_pw, idx), weights, (length!=0));
00440         }
00441     }
00442     delete[] vec ;
00443     tree_initialized=true ;
00444 }
00445 
00446 void CWeightedDegreeStringKernel::add_example_to_single_tree(
00447     int32_t idx, float64_t alpha, int32_t tree_num)
00448 {
00449     ASSERT(alphabet);
00450     ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00451 
00452     int32_t len;
00453     bool free_vec;
00454     char* char_vec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx, len, free_vec);
00455     ASSERT(max_mismatch==0);
00456     int32_t *vec = new int32_t[len] ;
00457 
00458     for (int32_t i=tree_num; i<tree_num+degree && i<len; i++)
00459         vec[i]=alphabet->remap_to_bin(char_vec[i]);
00460     ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec);
00461 
00462 
00463     ASSERT(tries);
00464     if (alpha!=0.0)
00465         tries->add_to_trie(tree_num, 0, vec, normalizer->normalize_lhs(alpha, idx), weights, (length!=0));
00466 
00467     delete[] vec ;
00468     tree_initialized=true ;
00469 }
00470 
00471 void CWeightedDegreeStringKernel::add_example_to_tree_mismatch(int32_t idx, float64_t alpha)
00472 {
00473     ASSERT(tries);
00474     ASSERT(alphabet);
00475     ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00476 
00477     int32_t len ;
00478     bool free_vec;
00479     char* char_vec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx, len, free_vec);
00480 
00481     int32_t *vec = new int32_t[len] ;
00482 
00483     for (int32_t i=0; i<len; i++)
00484         vec[i]=alphabet->remap_to_bin(char_vec[i]);
00485     ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec);
00486 
00487     for (int32_t i=0; i<len; i++)
00488     {
00489         if (alpha!=0.0)
00490             tries->add_example_to_tree_mismatch_recursion(NO_CHILD, i, normalizer->normalize_lhs(alpha, idx), &vec[i], len-i, 0, 0, max_mismatch, weights);
00491     }
00492 
00493     delete[] vec ;
00494     tree_initialized=true ;
00495 }
00496 
00497 void CWeightedDegreeStringKernel::add_example_to_single_tree_mismatch(
00498     int32_t idx, float64_t alpha, int32_t tree_num)
00499 {
00500     ASSERT(tries);
00501     ASSERT(alphabet);
00502     ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00503 
00504     int32_t len=0;
00505     bool free_vec;
00506     char* char_vec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx, len, free_vec);
00507     int32_t *vec=new int32_t[len];
00508 
00509     for (int32_t i=tree_num; i<len && i<tree_num+degree; i++)
00510         vec[i]=alphabet->remap_to_bin(char_vec[i]);
00511     ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec);
00512 
00513     if (alpha!=0.0)
00514     {
00515         tries->add_example_to_tree_mismatch_recursion(
00516             NO_CHILD, tree_num, normalizer->normalize_lhs(alpha, idx), &vec[tree_num], len-tree_num,
00517             0, 0, max_mismatch, weights);
00518     }
00519 
00520     delete[] vec;
00521     tree_initialized=true;
00522 }
00523 
00524 
00525 float64_t CWeightedDegreeStringKernel::compute_by_tree(int32_t idx)
00526 {
00527     ASSERT(alphabet);
00528     ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00529 
00530     int32_t len=0;
00531     bool free_vec;
00532     char* char_vec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx, len, free_vec);
00533     ASSERT(char_vec && len>0);
00534     int32_t *vec=new int32_t[len];
00535 
00536     for (int32_t i=0; i<len; i++)
00537         vec[i]=alphabet->remap_to_bin(char_vec[i]);
00538     ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec);
00539 
00540     float64_t sum=0;
00541     ASSERT(tries);
00542     for (int32_t i=0; i<len; i++)
00543         sum+=tries->compute_by_tree_helper(vec, len, i, i, i, weights, (length!=0));
00544 
00545     delete[] vec;
00546     return normalizer->normalize_rhs(sum, idx);
00547 }
00548 
00549 void CWeightedDegreeStringKernel::compute_by_tree(
00550     int32_t idx, float64_t* LevelContrib)
00551 {
00552     ASSERT(alphabet);
00553     ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00554 
00555     int32_t len ;
00556     bool free_vec;
00557     char* char_vec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx, len, free_vec);
00558 
00559     int32_t *vec = new int32_t[len] ;
00560 
00561     for (int32_t i=0; i<len; i++)
00562         vec[i]=alphabet->remap_to_bin(char_vec[i]);
00563     ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec);
00564 
00565     ASSERT(tries);
00566     for (int32_t i=0; i<len; i++)
00567     {
00568         tries->compute_by_tree_helper(vec, len, i, i, i, LevelContrib,
00569                 normalizer->normalize_rhs(1.0, idx),
00570                 mkl_stepsize, weights, (length!=0));
00571     }
00572 
00573     delete[] vec ;
00574 }
00575 
00576 float64_t *CWeightedDegreeStringKernel::compute_abs_weights(int32_t &len)
00577 {
00578     ASSERT(tries);
00579     return tries->compute_abs_weights(len);
00580 }
00581 
00582 bool CWeightedDegreeStringKernel::set_wd_weights_by_type(EWDKernType p_type)
00583 {
00584     ASSERT(degree>0);
00585     ASSERT(p_type==E_WD); 
00586 
00587     delete[] weights;
00588     weights=new float64_t[degree];
00589     if (weights)
00590     {
00591         int32_t i;
00592         float64_t sum=0;
00593         for (i=0; i<degree; i++)
00594         {
00595             weights[i]=degree-i;
00596             sum+=weights[i];
00597         }
00598         for (i=0; i<degree; i++)
00599             weights[i]/=sum;
00600 
00601         for (i=0; i<degree; i++)
00602         {
00603             for (int32_t j=1; j<=max_mismatch; j++)
00604             {
00605                 if (j<i+1)
00606                 {
00607                     int32_t nk=CMath::nchoosek(i+1, j);
00608                     weights[i+j*degree]=weights[i]/(nk*CMath::pow(3.0,j));
00609                 }
00610                 else
00611                     weights[i+j*degree]= 0;
00612             }
00613         }
00614 
00615         if (which_degree>=0)
00616         {
00617             ASSERT(which_degree<degree);
00618             for (i=0; i<degree; i++)
00619             {
00620                 if (i!=which_degree)
00621                     weights[i]=0;
00622                 else
00623                     weights[i]=1;
00624             }
00625         }
00626         return true;
00627     }
00628     else
00629         return false;
00630 }
00631 
00632 bool CWeightedDegreeStringKernel::set_weights(
00633     float64_t* ws, int32_t d, int32_t len)
00634 {
00635     if (d!=degree || len<0)
00636         SG_ERROR("WD: Dimension mismatch (should be (seq_length | 1) x degree) got (%d x %d)\n", len, degree);
00637 
00638     length=len;
00639 
00640     if (length==0)
00641         length=1;
00642 
00643     int32_t num_weights=degree*(length+max_mismatch);
00644     delete[] weights;
00645     weights=new float64_t[num_weights];
00646 
00647     if (weights)
00648     {
00649         for (int32_t i=0; i<num_weights; i++) {
00650             if (ws[i]) // len(ws) might be != num_weights?
00651                 weights[i]=ws[i];
00652         }
00653         return true;
00654     }
00655     else
00656         return false;
00657 }
00658 
00659 bool CWeightedDegreeStringKernel::set_position_weights(
00660     float64_t* pws, int32_t len)
00661 {
00662     if (len==0)
00663     {
00664         delete[] position_weights;
00665         position_weights=NULL;
00666         ASSERT(tries);
00667         tries->set_position_weights(position_weights);
00668     }
00669 
00670     if (seq_length!=len)
00671         SG_ERROR("seq_length = %i, position_weights_length=%i\n", seq_length, len);
00672 
00673     delete[] position_weights;
00674     position_weights=new float64_t[len];
00675     ASSERT(tries);
00676     tries->set_position_weights(position_weights);
00677 
00678     if (position_weights)
00679     {
00680         for (int32_t i=0; i<len; i++)
00681             position_weights[i]=pws[i];
00682         return true;
00683     }
00684     else
00685         return false;
00686 }
00687 
00688 bool CWeightedDegreeStringKernel::init_block_weights_from_wd()
00689 {
00690     delete[] block_weights;
00691     block_weights=new float64_t[CMath::max(seq_length,degree)];
00692 
00693     if (block_weights)
00694     {
00695         int32_t k;
00696         float64_t d=degree; // use float to evade rounding errors below
00697 
00698         for (k=0; k<degree; k++)
00699             block_weights[k]=
00700                 (-CMath::pow(k, 3)+(3*d-3)*CMath::pow(k, 2)+(9*d-2)*k+6*d)/(3*d*(d+1));
00701         for (k=degree; k<seq_length; k++)
00702             block_weights[k]=(-d+3*k+4)/3;
00703     }
00704 
00705     return (block_weights!=NULL);
00706 }
00707 
00708 bool CWeightedDegreeStringKernel::init_block_weights_from_wd_external()
00709 {
00710     ASSERT(weights);
00711     delete[] block_weights;
00712     block_weights=new float64_t[CMath::max(seq_length,degree)];
00713 
00714     if (block_weights)
00715     {
00716         int32_t i=0;
00717         block_weights[0]=weights[0];
00718         for (i=1; i<CMath::max(seq_length,degree); i++)
00719             block_weights[i]=0;
00720 
00721         for (i=1; i<CMath::max(seq_length,degree); i++)
00722         {
00723             block_weights[i]=block_weights[i-1];
00724 
00725             float64_t contrib=0;
00726             for (int32_t j=0; j<CMath::min(degree,i+1); j++)
00727                 contrib+=weights[j];
00728 
00729             block_weights[i]+=contrib;
00730         }
00731     }
00732 
00733     return (block_weights!=NULL);
00734 }
00735 
00736 bool CWeightedDegreeStringKernel::init_block_weights_const()
00737 {
00738     delete[] block_weights;
00739     block_weights=new float64_t[seq_length];
00740 
00741     if (block_weights)
00742     {
00743         for (int32_t i=1; i<seq_length+1 ; i++)
00744             block_weights[i-1]=1.0/seq_length;
00745     }
00746 
00747     return (block_weights!=NULL);
00748 }
00749 
00750 bool CWeightedDegreeStringKernel::init_block_weights_linear()
00751 {
00752     delete[] block_weights;
00753     block_weights=new float64_t[seq_length];
00754 
00755     if (block_weights)
00756     {
00757         for (int32_t i=1; i<seq_length+1 ; i++)
00758             block_weights[i-1]=degree*i;
00759     }
00760 
00761     return (block_weights!=NULL);
00762 }
00763 
00764 bool CWeightedDegreeStringKernel::init_block_weights_sqpoly()
00765 {
00766     delete[] block_weights;
00767     block_weights=new float64_t[seq_length];
00768 
00769     if (block_weights)
00770     {
00771         for (int32_t i=1; i<degree+1 ; i++)
00772             block_weights[i-1]=((float64_t) i)*i;
00773 
00774         for (int32_t i=degree+1; i<seq_length+1 ; i++)
00775             block_weights[i-1]=i;
00776     }
00777 
00778     return (block_weights!=NULL);
00779 }
00780 
00781 bool CWeightedDegreeStringKernel::init_block_weights_cubicpoly()
00782 {
00783     delete[] block_weights;
00784     block_weights=new float64_t[seq_length];
00785 
00786     if (block_weights)
00787     {
00788         for (int32_t i=1; i<degree+1 ; i++)
00789             block_weights[i-1]=((float64_t) i)*i*i;
00790 
00791         for (int32_t i=degree+1; i<seq_length+1 ; i++)
00792             block_weights[i-1]=i;
00793     }
00794 
00795     return (block_weights!=NULL);
00796 }
00797 
00798 bool CWeightedDegreeStringKernel::init_block_weights_exp()
00799 {
00800     delete[] block_weights;
00801     block_weights=new float64_t[seq_length];
00802 
00803     if (block_weights)
00804     {
00805         for (int32_t i=1; i<degree+1 ; i++)
00806             block_weights[i-1]=exp(((float64_t) i/10.0));
00807 
00808         for (int32_t i=degree+1; i<seq_length+1 ; i++)
00809             block_weights[i-1]=i;
00810     }
00811 
00812     return (block_weights!=NULL);
00813 }
00814 
00815 bool CWeightedDegreeStringKernel::init_block_weights_log()
00816 {
00817     delete[] block_weights;
00818     block_weights=new float64_t[seq_length];
00819 
00820     if (block_weights)
00821     {
00822         for (int32_t i=1; i<degree+1 ; i++)
00823             block_weights[i-1]=CMath::pow(CMath::log((float64_t) i),2);
00824 
00825         for (int32_t i=degree+1; i<seq_length+1 ; i++)
00826             block_weights[i-1]=i-degree+1+CMath::pow(CMath::log(degree+1.0),2);
00827     }
00828 
00829     return (block_weights!=NULL);
00830 }
00831 
00832 bool CWeightedDegreeStringKernel::init_block_weights_external()
00833 {
00834     if (block_weights_external && (seq_length == num_block_weights_external) )
00835     {
00836         delete[] block_weights;
00837         block_weights=new float64_t[seq_length];
00838 
00839         if (block_weights)
00840         {
00841             for (int32_t i=0; i<seq_length; i++)
00842                 block_weights[i]=block_weights_external[i];
00843         }
00844     }
00845     else {
00846       SG_ERROR( "sequence longer then weights (seqlen:%d, wlen:%d)\n", seq_length, block_weights_external);
00847    }
00848     return (block_weights!=NULL);
00849 }
00850 
00851 bool CWeightedDegreeStringKernel::init_block_weights()
00852 {
00853     switch (type)
00854     {
00855         case E_WD:
00856             return init_block_weights_from_wd();
00857         case E_EXTERNAL:
00858             return init_block_weights_from_wd_external();
00859         case E_BLOCK_CONST:
00860             return init_block_weights_const();
00861         case E_BLOCK_LINEAR:
00862             return init_block_weights_linear();
00863         case E_BLOCK_SQPOLY:
00864             return init_block_weights_sqpoly();
00865         case E_BLOCK_CUBICPOLY:
00866             return init_block_weights_cubicpoly();
00867         case E_BLOCK_EXP:
00868             return init_block_weights_exp();
00869         case E_BLOCK_LOG:
00870             return init_block_weights_log();
00871         case E_BLOCK_EXTERNAL:
00872             return init_block_weights_external();
00873         default:
00874             return false;
00875     };
00876 }
00877 
00878 
00879 void* CWeightedDegreeStringKernel::compute_batch_helper(void* p)
00880 {
00881     S_THREAD_PARAM* params = (S_THREAD_PARAM*) p;
00882     int32_t j=params->j;
00883     CWeightedDegreeStringKernel* wd=params->kernel;
00884     CTrie<DNATrie>* tries=params->tries;
00885     float64_t* weights=params->weights;
00886     int32_t length=params->length;
00887     int32_t* vec=params->vec;
00888     float64_t* result=params->result;
00889     float64_t factor=params->factor;
00890     int32_t* vec_idx=params->vec_idx;
00891 
00892     CStringFeatures<char>* rhs_feat=((CStringFeatures<char>*) wd->get_rhs());
00893     CAlphabet* alpha=wd->alphabet;
00894 
00895     for (int32_t i=params->start; i<params->end; i++)
00896     {
00897         int32_t len=0;
00898         bool free_vec;
00899         char* char_vec=rhs_feat->get_feature_vector(vec_idx[i], len, free_vec);
00900         for (int32_t k=j; k<CMath::min(len,j+wd->get_degree()); k++)
00901             vec[k]=alpha->remap_to_bin(char_vec[k]);
00902         rhs_feat->free_feature_vector(char_vec, vec_idx[i], free_vec);
00903 
00904         ASSERT(tries);
00905 
00906         result[i]+=factor*
00907             wd->normalizer->normalize_rhs(tries->compute_by_tree_helper(vec, len, j, j, j, weights, (length!=0)), vec_idx[i]);
00908     }
00909 
00910     SG_UNREF(rhs_feat);
00911 
00912     return NULL;
00913 }
00914 
00915 void CWeightedDegreeStringKernel::compute_batch(
00916     int32_t num_vec, int32_t* vec_idx, float64_t* result, int32_t num_suppvec,
00917     int32_t* IDX, float64_t* alphas, float64_t factor)
00918 {
00919     ASSERT(tries);
00920     ASSERT(alphabet);
00921     ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00922     ASSERT(rhs);
00923     ASSERT(num_vec<=rhs->get_num_vectors());
00924     ASSERT(num_vec>0);
00925     ASSERT(vec_idx);
00926     ASSERT(result);
00927     create_empty_tries();
00928 
00929     int32_t num_feat=((CStringFeatures<char>*) rhs)->get_max_vector_length();
00930     ASSERT(num_feat>0);
00931     int32_t num_threads=parallel->get_num_threads();
00932     ASSERT(num_threads>0);
00933     int32_t* vec=new int32_t[num_threads*num_feat];
00934 
00935     if (num_threads < 2)
00936     {
00937 #ifdef CYGWIN
00938         for (int32_t j=0; j<num_feat; j++)
00939 #else
00940         CSignal::clear_cancel();
00941         for (int32_t j=0; j<num_feat && !CSignal::cancel_computations(); j++)
00942 #endif
00943         {
00944             init_optimization(num_suppvec, IDX, alphas, j);
00945             S_THREAD_PARAM params;
00946             params.vec=vec;
00947             params.result=result;
00948             params.weights=weights;
00949             params.kernel=this;
00950             params.tries=tries;
00951             params.factor=factor;
00952             params.j=j;
00953             params.start=0;
00954             params.end=num_vec;
00955             params.length=length;
00956             params.vec_idx=vec_idx;
00957             compute_batch_helper((void*) &params);
00958 
00959             SG_PROGRESS(j,0,num_feat);
00960         }
00961     }
00962 #ifndef WIN32
00963     else
00964     {
00965         CSignal::clear_cancel();
00966         for (int32_t j=0; j<num_feat && !CSignal::cancel_computations(); j++)
00967         {
00968             init_optimization(num_suppvec, IDX, alphas, j);
00969             pthread_t* threads = new pthread_t[num_threads-1];
00970             S_THREAD_PARAM* params = new S_THREAD_PARAM[num_threads];
00971             int32_t step= num_vec/num_threads;
00972             int32_t t;
00973 
00974             for (t=0; t<num_threads-1; t++)
00975             {
00976                 params[t].vec=&vec[num_feat*t];
00977                 params[t].result=result;
00978                 params[t].weights=weights;
00979                 params[t].kernel=this;
00980                 params[t].tries=tries;
00981                 params[t].factor=factor;
00982                 params[t].j=j;
00983                 params[t].start = t*step;
00984                 params[t].end = (t+1)*step;
00985                 params[t].length=length;
00986                 params[t].vec_idx=vec_idx;
00987                 pthread_create(&threads[t], NULL, CWeightedDegreeStringKernel::compute_batch_helper, (void*)&params[t]);
00988             }
00989             params[t].vec=&vec[num_feat*t];
00990             params[t].result=result;
00991             params[t].weights=weights;
00992             params[t].kernel=this;
00993             params[t].tries=tries;
00994             params[t].factor=factor;
00995             params[t].j=j;
00996             params[t].start=t*step;
00997             params[t].end=num_vec;
00998             params[t].length=length;
00999             params[t].vec_idx=vec_idx;
01000             compute_batch_helper((void*) &params[t]);
01001 
01002             for (t=0; t<num_threads-1; t++)
01003                 pthread_join(threads[t], NULL);
01004             SG_PROGRESS(j,0,num_feat);
01005 
01006             delete[] params;
01007             delete[] threads;
01008         }
01009     }
01010 #endif
01011 
01012     delete[] vec;
01013 
01014     //really also free memory as this can be huge on testing especially when
01015     //using the combined kernel
01016     create_empty_tries();
01017 }
01018 
01019 bool CWeightedDegreeStringKernel::set_max_mismatch(int32_t max)
01020 {
01021     if (type==E_EXTERNAL && max!=0) {
01022         return false;
01023     }
01024 
01025     max_mismatch=max;
01026 
01027     if (lhs!=NULL && rhs!=NULL)
01028         return init(lhs, rhs);
01029     else
01030         return true;
01031 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation