00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #ifndef _WEIGHTEDDEGREEPOSITIONSTRINGKERNEL_H___
00013 #define _WEIGHTEDDEGREEPOSITIONSTRINGKERNEL_H___
00014
00015 #include "lib/common.h"
00016 #include "kernel/StringKernel.h"
00017 #include "kernel/WeightedDegreeStringKernel.h"
00018 #include "lib/Trie.h"
00019
00020 namespace shogun
00021 {
00022
00023 class CSVM;
00024
00048 class CWeightedDegreePositionStringKernel: public CStringKernel<char>
00049 {
00050 public:
00058 CWeightedDegreePositionStringKernel(
00059 int32_t size, int32_t degree,
00060 int32_t max_mismatch=0, int32_t mkl_stepsize=1);
00061
00072 CWeightedDegreePositionStringKernel(
00073 int32_t size, float64_t* weights, int32_t degree,
00074 int32_t max_mismatch, int32_t* shift, int32_t shift_len,
00075 int32_t mkl_stepsize=1);
00076
00083 CWeightedDegreePositionStringKernel(
00084 CStringFeatures<char>* l, CStringFeatures<char>* r, int32_t degree);
00085
00086 virtual ~CWeightedDegreePositionStringKernel();
00087
00094 virtual bool init(CFeatures* l, CFeatures* r);
00095
00097 virtual void cleanup();
00098
00103 virtual EKernelType get_kernel_type() { return K_WEIGHTEDDEGREEPOS; }
00104
00109 virtual const char* get_name() const { return "WeightedDegreePos"; }
00110
00118 inline virtual bool init_optimization(
00119 int32_t p_count, int32_t *IDX, float64_t * alphas)
00120 {
00121 return init_optimization(p_count, IDX, alphas, -1);
00122 }
00123
00135 virtual bool init_optimization(
00136 int32_t count, int32_t *IDX, float64_t * alphas, int32_t tree_num,
00137 int32_t upto_tree=-1);
00138
00143 virtual bool delete_optimization();
00144
00150 inline virtual float64_t compute_optimized(int32_t idx)
00151 {
00152 ASSERT(get_is_initialized());
00153 ASSERT(alphabet);
00154 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00155 return compute_by_tree(idx);
00156 }
00157
00162 static void* compute_batch_helper(void* p);
00163
00174 virtual void compute_batch(
00175 int32_t num_vec, int32_t* vec_idx, float64_t* target,
00176 int32_t num_suppvec, int32_t* IDX, float64_t* alphas,
00177 float64_t factor=1.0);
00178
00182 inline virtual void clear_normal()
00183 {
00184 if ((opt_type==FASTBUTMEMHUNGRY) && (tries.get_use_compact_terminal_nodes()))
00185 {
00186 tries.set_use_compact_terminal_nodes(false) ;
00187 SG_DEBUG( "disabling compact trie nodes with FASTBUTMEMHUNGRY\n") ;
00188 }
00189
00190 if (get_is_initialized())
00191 {
00192 if (opt_type==SLOWBUTMEMEFFICIENT)
00193 tries.delete_trees(true);
00194 else if (opt_type==FASTBUTMEMHUNGRY)
00195 tries.delete_trees(false);
00196 else
00197 SG_ERROR( "unknown optimization type\n");
00198
00199 set_is_initialized(false);
00200 }
00201 }
00202
00208 inline virtual void add_to_normal(int32_t idx, float64_t weight)
00209 {
00210 add_example_to_tree(idx, weight);
00211 set_is_initialized(true);
00212 }
00213
00218 inline virtual int32_t get_num_subkernels()
00219 {
00220 if (position_weights!=NULL)
00221 return (int32_t) ceil(1.0*seq_length/mkl_stepsize) ;
00222 if (length==0)
00223 return (int32_t) ceil(1.0*get_degree()/mkl_stepsize);
00224 return (int32_t) ceil(1.0*get_degree()*length/mkl_stepsize) ;
00225 }
00226
00232 inline void compute_by_subkernel(
00233 int32_t idx, float64_t * subkernel_contrib)
00234 {
00235 if (get_is_initialized())
00236 {
00237 compute_by_tree(idx, subkernel_contrib);
00238 return ;
00239 }
00240
00241 SG_ERROR( "CWeightedDegreePositionStringKernel optimization not initialized\n") ;
00242 }
00243
00249 inline const float64_t* get_subkernel_weights(int32_t& num_weights)
00250 {
00251 num_weights = get_num_subkernels() ;
00252
00253 delete[] weights_buffer ;
00254 weights_buffer = new float64_t[num_weights] ;
00255
00256 if (position_weights!=NULL)
00257 for (int32_t i=0; i<num_weights; i++)
00258 weights_buffer[i] = position_weights[i*mkl_stepsize] ;
00259 else
00260 for (int32_t i=0; i<num_weights; i++)
00261 weights_buffer[i] = weights[i*mkl_stepsize] ;
00262
00263 return weights_buffer ;
00264 }
00265
00271 inline void set_subkernel_weights(
00272 float64_t* weights2, int32_t num_weights2)
00273 {
00274 int32_t num_weights = get_num_subkernels() ;
00275 if (num_weights!=num_weights2)
00276 SG_ERROR( "number of weights do not match\n") ;
00277
00278 if (position_weights!=NULL)
00279 for (int32_t i=0; i<num_weights; i++)
00280 for (int32_t j=0; j<mkl_stepsize; j++)
00281 {
00282 if (i*mkl_stepsize+j<seq_length)
00283 position_weights[i*mkl_stepsize+j] = weights2[i] ;
00284 }
00285 else if (length==0)
00286 {
00287 for (int32_t i=0; i<num_weights; i++)
00288 for (int32_t j=0; j<mkl_stepsize; j++)
00289 if (i*mkl_stepsize+j<get_degree())
00290 weights[i*mkl_stepsize+j] = weights2[i] ;
00291 }
00292 else
00293 {
00294 for (int32_t i=0; i<num_weights; i++)
00295 for (int32_t j=0; j<mkl_stepsize; j++)
00296 if (i*mkl_stepsize+j<get_degree()*length)
00297 weights[i*mkl_stepsize+j] = weights2[i] ;
00298 }
00299 }
00300
00301
00307 float64_t* compute_abs_weights(int32_t & len);
00308
00313 bool is_tree_initialized() { return tree_initialized; }
00314
00319 inline int32_t get_max_mismatch() { return max_mismatch; }
00320
00325 inline int32_t get_degree() { return degree; }
00326
00332 inline float64_t *get_degree_weights(int32_t& d, int32_t& len)
00333 {
00334 d=degree;
00335 len=length;
00336 return weights;
00337 }
00338
00344 inline float64_t *get_weights(int32_t& num_weights)
00345 {
00346 if (position_weights!=NULL)
00347 {
00348 num_weights = seq_length ;
00349 return position_weights ;
00350 }
00351 if (length==0)
00352 num_weights = degree ;
00353 else
00354 num_weights = degree*length ;
00355 return weights;
00356 }
00357
00363 inline float64_t *get_position_weights(int32_t& len)
00364 {
00365 len=seq_length;
00366 return position_weights;
00367 }
00368
00374 bool set_shifts(int32_t* shifts, int32_t len);
00375
00382 virtual bool set_weights(float64_t* weights, int32_t d, int32_t len=0);
00383
00388 virtual bool set_wd_weights();
00389
00396 virtual bool set_position_weights(float64_t* pws, int32_t len);
00397
00405 bool set_position_weights_lhs(float64_t* pws, int32_t len, int32_t num);
00406
00414 bool set_position_weights_rhs(float64_t* pws, int32_t len, int32_t num);
00415
00420 bool init_block_weights();
00421
00426 bool init_block_weights_from_wd();
00427
00432 bool init_block_weights_from_wd_external();
00433
00438 bool init_block_weights_const();
00439
00444 bool init_block_weights_linear();
00445
00450 bool init_block_weights_sqpoly();
00451
00456 bool init_block_weights_cubicpoly();
00457
00462 bool init_block_weights_exp();
00463
00468 bool init_block_weights_log();
00469
00474 bool init_block_weights_external();
00475
00480 bool delete_position_weights()
00481 {
00482 delete[] position_weights;
00483 position_weights=NULL;
00484 return true;
00485 }
00486
00491 bool delete_position_weights_lhs()
00492 {
00493 delete[] position_weights_lhs;
00494 position_weights_lhs=NULL;
00495 return true;
00496 }
00497
00502 bool delete_position_weights_rhs()
00503 {
00504 delete[] position_weights_rhs;
00505 position_weights_rhs=NULL;
00506 return true;
00507 }
00508
00514 virtual float64_t compute_by_tree(int32_t idx);
00515
00521 virtual void compute_by_tree(int32_t idx, float64_t* LevelContrib);
00522
00535 float64_t* compute_scoring(
00536 int32_t max_degree, int32_t& num_feat, int32_t& num_sym,
00537 float64_t* target, int32_t num_suppvec, int32_t* IDX,
00538 float64_t* weights);
00539
00548 char* compute_consensus(
00549 int32_t &num_feat, int32_t num_suppvec, int32_t* IDX,
00550 float64_t* alphas);
00551
00563 float64_t* extract_w(
00564 int32_t max_degree, int32_t& num_feat, int32_t& num_sym,
00565 float64_t* w_result, int32_t num_suppvec, int32_t* IDX,
00566 float64_t* alphas);
00567
00580 float64_t* compute_POIM(
00581 int32_t max_degree, int32_t& num_feat, int32_t& num_sym,
00582 float64_t* poim_result, int32_t num_suppvec, int32_t* IDX,
00583 float64_t* alphas, float64_t* distrib);
00584
00591 void prepare_POIM2(
00592 float64_t* distrib, int32_t num_sym, int32_t num_feat);
00593
00600 void compute_POIM2(int32_t max_degree, CSVM* svm);
00601
00607 void get_POIM2(float64_t** poim, int32_t* result_len);
00608
00610 void cleanup_POIM2();
00611
00612 protected:
00614 void create_empty_tries();
00615
00621 virtual void add_example_to_tree(
00622 int32_t idx, float64_t weight);
00623
00630 void add_example_to_single_tree(
00631 int32_t idx, float64_t weight, int32_t tree_num);
00632
00641 virtual float64_t compute(int32_t idx_a, int32_t idx_b);
00642
00651 float64_t compute_with_mismatch(
00652 char* avec, int32_t alen, char* bvec, int32_t blen);
00653
00662 float64_t compute_without_mismatch(
00663 char* avec, int32_t alen, char* bvec, int32_t blen);
00664
00673 float64_t compute_without_mismatch_matrix(
00674 char* avec, int32_t alen, char* bvec, int32_t blen);
00675
00686 float64_t compute_without_mismatch_position_weights(
00687 char* avec, float64_t *posweights_lhs, int32_t alen,
00688 char* bvec, float64_t *posweights_rhs, int32_t blen);
00689
00691 virtual void remove_lhs();
00692
00693 protected:
00695 float64_t* weights;
00697 float64_t* position_weights;
00699 float64_t* position_weights_lhs;
00701 float64_t* position_weights_rhs;
00703 bool* position_mask;
00704
00706 float64_t* weights_buffer;
00708 int32_t mkl_stepsize;
00709
00711 int32_t degree;
00713 int32_t length;
00714
00716 int32_t max_mismatch;
00718 int32_t seq_length;
00719
00721 int32_t *shift;
00723 int32_t shift_len;
00725 int32_t max_shift;
00726
00728 bool block_computation;
00729
00731 int32_t num_block_weights_external;
00733 float64_t* block_weights_external;
00734
00736 float64_t* block_weights;
00738 EWDKernType type;
00740 int32_t which_degree;
00741
00743 CTrie<DNATrie> tries;
00745 CTrie<POIMTrie> poim_tries;
00746
00748 bool tree_initialized;
00750 bool use_poim_tries;
00751
00753 float64_t* m_poim_distrib;
00755 float64_t* m_poim;
00756
00758 int32_t m_poim_num_sym;
00760 int32_t m_poim_num_feat;
00762 int32_t m_poim_result_len;
00763
00765 CAlphabet* alphabet;
00766 };
00767 }
00768 #endif