00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #ifndef _WEIGHTEDDEGREESTRINGKERNEL_H___
00013 #define _WEIGHTEDDEGREESTRINGKERNEL_H___
00014
00015 #include "lib/common.h"
00016 #include "lib/Trie.h"
00017 #include "kernel/StringKernel.h"
00018 #include "kernel/MultitaskKernelMklNormalizer.h"
00019 #include "features/StringFeatures.h"
00020
00021
00022
00023
00024
00025
00026 namespace shogun
00027 {
00028
/** Weighting scheme selector for the weighted degree kernel.
 *
 * The numeric values are spelled out explicitly because the enum is written
 * to and read from archives by the serialization helpers in this file, so
 * they must stay stable.
 */
enum EWDKernType
{
	/* plain weighted-degree weighting and externally supplied weights */
	E_WD              = 0,
	E_EXTERNAL        = 1,

	/* block-weight flavours; each has a matching init_block_weights_*()
	 * member declared on CWeightedDegreeStringKernel */
	E_BLOCK_CONST     = 2,
	E_BLOCK_LINEAR    = 3,
	E_BLOCK_SQPOLY    = 4,
	E_BLOCK_CUBICPOLY = 5,
	E_BLOCK_EXP       = 6,
	E_BLOCK_LOG       = 7,
	E_BLOCK_EXTERNAL  = 8
};
00042
00043
00058 class CWeightedDegreeStringKernel: public CStringKernel<char>
00059 {
00060 public:
00061
00067 CWeightedDegreeStringKernel(int32_t degree, EWDKernType type=E_WD);
00068
00074 CWeightedDegreeStringKernel(float64_t* weights, int32_t degree);
00075
00082 CWeightedDegreeStringKernel(
00083 CStringFeatures<char>* l, CStringFeatures<char>* r, int32_t degree);
00084
00085 virtual ~CWeightedDegreeStringKernel();
00086
00093 virtual bool init(CFeatures* l, CFeatures* r);
00094
00096 virtual void cleanup();
00097
00105 EWDKernType get_type() const
00106 {
00107 return type;
00108 }
00109
00114 int32_t get_degree() const
00115 {
00116 return degree;
00117 }
00118
00124 int32_t get_max_mismatch() const
00125 {
00126 return max_mismatch;
00127 }
00128
00133 virtual EKernelType get_kernel_type() { return K_WEIGHTEDDEGREE; }
00134
00139 virtual const char* get_name() const { return "WeightedDegree"; }
00140
00148 inline virtual bool init_optimization(
00149 int32_t count, int32_t *IDX, float64_t* alphas)
00150 {
00151 return init_optimization(count, IDX, alphas, -1);
00152 }
00153
00164 virtual bool init_optimization(
00165 int32_t count, int32_t *IDX, float64_t* alphas, int32_t tree_num);
00166
00171 virtual bool delete_optimization();
00172
00178 virtual float64_t compute_optimized(int32_t idx)
00179 {
00180 if (get_is_initialized())
00181 return compute_by_tree(idx);
00182
00183 SG_ERROR( "CWeightedDegreeStringKernel optimization not initialized\n");
00184 return 0;
00185 }
00186
00191 static void* compute_batch_helper(void* p);
00192
00203 virtual void compute_batch(
00204 int32_t num_vec, int32_t* vec_idx, float64_t* target,
00205 int32_t num_suppvec, int32_t* IDX, float64_t* alphas,
00206 float64_t factor=1.0);
00207
00211 inline virtual void clear_normal()
00212 {
00213 if (get_is_initialized())
00214 {
00215
00216 if (normalizer && normalizer->get_normalizer_type()==N_MULTITASK)
00217 SG_ERROR("not implemented");
00218
00219 tries->delete_trees(max_mismatch==0);
00220 set_is_initialized(false);
00221 }
00222 }
00223
00229 inline virtual void add_to_normal(int32_t idx, float64_t weight)
00230 {
00231
00232 if (normalizer && normalizer->get_normalizer_type()==N_MULTITASK)
00233 SG_ERROR("not implemented");
00234
00235 if (max_mismatch==0)
00236 add_example_to_tree(idx, weight);
00237 else
00238 add_example_to_tree_mismatch(idx, weight);
00239
00240 set_is_initialized(true);
00241 }
00242
00247 inline virtual int32_t get_num_subkernels()
00248 {
00249 if (normalizer && normalizer->get_normalizer_type()==N_MULTITASK)
00250 return ((CMultitaskKernelMklNormalizer*)normalizer)->get_num_betas();
00251 if (position_weights!=NULL)
00252 return (int32_t) ceil(1.0*seq_length/mkl_stepsize) ;
00253 if (length==0)
00254 return (int32_t) ceil(1.0*get_degree()/mkl_stepsize);
00255 return (int32_t) ceil(1.0*get_degree()*length/mkl_stepsize) ;
00256 }
00257
00263 inline void compute_by_subkernel(
00264 int32_t idx, float64_t * subkernel_contrib)
00265 {
00266
00267 if (get_is_initialized())
00268 {
00269
00270 if (normalizer && normalizer->get_normalizer_type()==N_MULTITASK)
00271 SG_ERROR("not implemented");
00272
00273 compute_by_tree(idx, subkernel_contrib);
00274 return ;
00275 }
00276
00277 SG_ERROR( "CWeightedDegreeStringKernel optimization not initialized\n");
00278 }
00279
00285 inline const float64_t* get_subkernel_weights(int32_t& num_weights)
00286 {
00287
00288 num_weights = get_num_subkernels();
00289
00290 delete[] weights_buffer ;
00291 weights_buffer = new float64_t[num_weights];
00292
00293 if (normalizer && normalizer->get_normalizer_type()==N_MULTITASK)
00294 for (int32_t i=0; i<num_weights; i++)
00295 weights_buffer[i] = ((CMultitaskKernelMklNormalizer*)normalizer)->get_beta(i);
00296 else if (position_weights!=NULL)
00297 for (int32_t i=0; i<num_weights; i++)
00298 weights_buffer[i] = position_weights[i*mkl_stepsize];
00299 else
00300 for (int32_t i=0; i<num_weights; i++)
00301 weights_buffer[i] = weights[i*mkl_stepsize];
00302
00303 return weights_buffer;
00304 }
00305
00311 inline void set_subkernel_weights(
00312 float64_t* weights2, int32_t num_weights2)
00313 {
00314 int32_t num_weights = get_num_subkernels();
00315 if (num_weights!=num_weights2)
00316 SG_ERROR( "number of weights do not match\n");
00317
00318
00319 if (normalizer && normalizer->get_normalizer_type()==N_MULTITASK)
00320 for (int32_t i=0; i<num_weights; i++)
00321 ((CMultitaskKernelMklNormalizer*)normalizer)->set_beta(i, weights2[i]);
00322 else if (position_weights!=NULL)
00323 {
00324 for (int32_t i=0; i<num_weights; i++)
00325 {
00326 for (int32_t j=0; j<mkl_stepsize; j++)
00327 {
00328 if (i*mkl_stepsize+j<seq_length)
00329 position_weights[i*mkl_stepsize+j] = weights2[i];
00330 }
00331 }
00332 }
00333 else if (length==0)
00334 {
00335 for (int32_t i=0; i<num_weights; i++)
00336 {
00337 for (int32_t j=0; j<mkl_stepsize; j++)
00338 {
00339 if (i*mkl_stepsize+j<get_degree())
00340 weights[i*mkl_stepsize+j] = weights2[i];
00341 }
00342 }
00343 }
00344 else
00345 {
00346 for (int32_t i=0; i<num_weights; i++)
00347 {
00348 for (int32_t j=0; j<mkl_stepsize; j++)
00349 {
00350 if (i*mkl_stepsize+j<get_degree()*length)
00351 weights[i*mkl_stepsize+j] = weights2[i];
00352 }
00353 }
00354 }
00355 }
00356
00361 virtual bool set_normalizer(CKernelNormalizer* normalizer_) {
00362
00363 if (normalizer_ && strcmp(normalizer_->get_name(),"MultitaskKernelTreeNormalizer")==0) {
00364 unset_property(KP_LINADD);
00365 unset_property(KP_BATCHEVALUATION);
00366 }
00367 else
00368 {
00369 set_property(KP_LINADD);
00370 set_property(KP_BATCHEVALUATION);
00371 }
00372
00373
00374 return CStringKernel<char>::set_normalizer(normalizer_);
00375
00376 }
00377
00378
00384 float64_t *compute_abs_weights(int32_t & len);
00385
00392 void compute_by_tree(int32_t idx, float64_t *LevelContrib);
00393
00398 bool is_tree_initialized() { return tree_initialized; }
00399
00405 inline float64_t *get_degree_weights(int32_t& d, int32_t& len)
00406 {
00407 d=degree;
00408 len=length;
00409 return weights;
00410 }
00411
00417 inline float64_t *get_weights(int32_t& num_weights)
00418 {
00419
00420 if (normalizer && normalizer->get_normalizer_type()==N_MULTITASK)
00421 SG_ERROR("not implemented");
00422
00423 if (position_weights!=NULL)
00424 {
00425 num_weights = seq_length ;
00426 return position_weights ;
00427 }
00428 if (length==0)
00429 num_weights = degree ;
00430 else
00431 num_weights = degree*length ;
00432 return weights;
00433 }
00434
00440 inline float64_t *get_position_weights(int32_t& len)
00441 {
00442 len=seq_length;
00443 return position_weights;
00444 }
00445
00451 bool set_wd_weights_by_type(EWDKernType type);
00452
00459 void set_wd_weights(float64_t* p_weights, int32_t d)
00460 {
00461 set_weights(p_weights,d,0);
00462 }
00463
00470 bool set_weights(float64_t* weights, int32_t d, int32_t len);
00471
00478 bool set_position_weights(float64_t* position_weights, int32_t len=0);
00479
00484 bool init_block_weights();
00485
00490 bool init_block_weights_from_wd();
00491
00496 bool init_block_weights_from_wd_external();
00497
00502 bool init_block_weights_const();
00503
00508 bool init_block_weights_linear();
00509
00514 bool init_block_weights_sqpoly();
00515
00520 bool init_block_weights_cubicpoly();
00521
00526 bool init_block_weights_exp();
00527
00532 bool init_block_weights_log();
00533
00538 bool init_block_weights_external();
00539
00544 bool delete_position_weights()
00545 {
00546 delete[] position_weights;
00547 position_weights=NULL;
00548 return true;
00549 }
00550
00556 bool set_max_mismatch(int32_t max);
00557
00562 inline int32_t get_max_mismatch() { return max_mismatch; }
00563
00569 inline bool set_degree(int32_t deg) { degree=deg; return true; }
00570
00575 inline int32_t get_degree() { return degree; }
00576
00582 inline bool set_use_block_computation(bool block)
00583 {
00584 block_computation=block;
00585 return true;
00586 }
00587
00592 inline bool get_use_block_computation() { return block_computation; }
00593
00599 inline bool set_mkl_stepsize(int32_t step)
00600 {
00601 if (step<1)
00602 SG_ERROR("Stepsize must be a positive integer\n");
00603 mkl_stepsize=step;
00604 return true;
00605 }
00606
00611 inline int32_t get_mkl_stepsize() { return mkl_stepsize; }
00612
00618 inline bool set_which_degree(int32_t which)
00619 {
00620 which_degree=which;
00621 return true;
00622 }
00623
00628 inline int32_t get_which_degree() { return which_degree; }
00629
00630 protected:
00632 void create_empty_tries();
00633
00639 void add_example_to_tree(int32_t idx, float64_t weight);
00640
00647 void add_example_to_single_tree(
00648 int32_t idx, float64_t weight, int32_t tree_num);
00649
00655 void add_example_to_tree_mismatch(int32_t idx, float64_t weight);
00656
00663 void add_example_to_single_tree_mismatch(
00664 int32_t idx, float64_t weight, int32_t tree_num);
00665
00671 float64_t compute_by_tree(int32_t idx);
00672
00681 float64_t compute(int32_t idx_a, int32_t idx_b);
00682
00691 float64_t compute_with_mismatch(
00692 char* avec, int32_t alen, char* bvec, int32_t blen);
00693
00702 float64_t compute_without_mismatch(
00703 char* avec, int32_t alen, char* bvec, int32_t blen);
00704
00713 float64_t compute_without_mismatch_matrix(
00714 char* avec, int32_t alen, char* bvec, int32_t blen);
00715
00724 float64_t compute_using_block(char* avec, int32_t alen,
00725 char* bvec, int32_t blen);
00726
00728 virtual void remove_lhs();
00729
00730
00731 #ifdef HAVE_BOOST_SERIALIZATION
00732
00733
00734
00735
00736 private:
00737
00738
00739
00740
00741 friend class ::boost::serialization::access;
00742
00743
00744
00745
00746 template<class Archive>
00747 void save(Archive & ar, const unsigned int archive_version) const
00748 {
00749
00750 SG_DEBUG("archiving CWeightedDegreeStringKernel\n");
00751
00752 ar & ::boost::serialization::base_object<CStringKernel<char> >(*this);
00753
00754
00757 ar & mkl_stepsize ;
00758
00759 ar & length;
00760 ar & max_mismatch ;
00761
00762
00763
00764
00765
00766
00767
00768
00769 ar & seq_length ;
00770
00771 ar & initialized ;
00772 ar & block_computation;
00773
00774
00775
00776
00777 ar & num_block_weights_external;
00778 for (int32_t i=0; i < num_block_weights_external; ++i)
00779 {
00780 ar & block_weights_external[i];
00781 }
00782
00783
00784
00785
00786 ar & which_degree;
00787
00788
00789
00790
00791
00792
00793
00794
00795
00796
00797 SG_DEBUG("done with CWeightedDegreeStringKernel\n");
00798
00799 }
00800
00801 template<class Archive>
00802 void load(Archive & ar, const unsigned int archive_version)
00803 {
00804 SG_DEBUG("archiving CWeightedDegreeStringKernel\n");
00805
00806 ar & ::boost::serialization::base_object<CStringKernel<char> >(*this);
00807
00808
00811 ar & mkl_stepsize ;
00812 ar & degree;
00813 ar & length;
00814 ar & max_mismatch ;
00815
00816
00817
00818
00819
00820
00821
00822
00823
00824
00825 ar & seq_length ;
00826
00827 ar & initialized ;
00828 ar & block_computation;
00829
00830
00831
00832
00833 ar & num_block_weights_external;
00834
00835 block_weights_external = new float64_t[num_block_weights_external];
00836 for (int32_t i=0; i < num_block_weights_external; ++i)
00837 {
00838 ar & block_weights_external[i];
00839 }
00840
00841
00842
00843
00844 ar & which_degree;
00845
00846
00847
00848
00849
00850 SG_DEBUG("done with CWeightedDegreeStringKernel\n");
00851
00852 }
00853
00854 GLOBAL_BOOST_SERIALIZATION_SPLIT_MEMBER();
00855
00856
00857 public:
00858
00859 virtual std::string toString() const
00860 {
00861 std::ostringstream s;
00862
00863 ::boost::archive::text_oarchive oa(s);
00864
00865 oa << *this;
00866
00867 return s.str();
00868 }
00869
00870 virtual void fromString(std::string str)
00871 {
00872
00873 std::istringstream is(str);
00874
00875 ::boost::archive::text_iarchive ia(is);
00876
00877 ia >> *this;
00878
00879 }
00880
00881 #endif //HAVE_BOOST_SERIALIZATION
00882
00883
00884 protected:
00888 float64_t* weights;
00890 float64_t* position_weights;
00892 float64_t* weights_buffer;
00894 int32_t mkl_stepsize;
00896 int32_t degree;
00898 int32_t length;
00899
00901 int32_t max_mismatch;
00903 int32_t seq_length;
00904
00906 bool initialized;
00907
00909 bool block_computation;
00910
00912 int32_t num_block_weights_external;
00914 float64_t* block_weights_external;
00915
00917 float64_t* block_weights;
00919 EWDKernType type;
00921 int32_t which_degree;
00922
00924 CTrie<DNATrie>* tries;
00925
00927 bool tree_initialized;
00928
00930 CAlphabet* alphabet;
00931 };
00932
00933 }
00934
00935
00936
00937 #ifdef HAVE_BOOST_SERIALIZATION
00938 #include <boost/serialization/export.hpp>
00939
00940 #endif //HAVE_BOOST_SERIALIZATION
00941
00942
00943 #ifdef HAVE_BOOST_SERIALIZATION
00944
00945 namespace boost
00946 {
00947 namespace serialization
00948 {
00949 template<class Archive>
00950
00951 inline void save_construct_data(Archive & ar, shogun::CWeightedDegreeStringKernel* t, const unsigned int file_version)
00952 {
00953
00954 std::cout << "saving WDK from non-defaultconstruct data works" << std::endl;
00955
00956
00957
00958
00959
00960 ar << t->type;
00961
00962 ar << t->degree;
00963
00964
00965
00966
00967
00968
00969
00970
00971
00972
00973
00974
00975
00976
00977
00978
00979
00980
00981
00982
00983
00984
00985
00986
00987 std::cout << "done saving WDK from non-defaultconstruct data" << std::endl;
00988
00989 }
00990
00991 template<class Archive>
00992 inline void load_construct_data(Archive & ar, shogun::CWeightedDegreeStringKernel * t, const unsigned int file_version)
00993 {
00994
00995 std::cout << "loading WDK from non-defaultconstruct data" << std::endl;
00996
00997
00998
00999 shogun::EWDKernType type;
01000 int32_t degree;
01001
01002 ar >> type;
01003 ar >> degree;
01004
01005
01006
01007
01008
01009
01010
01011
01012
01013
01014
01015
01016
01017
01018
01019
01020
01021
01022
01023
01024
01025 ::new(t)shogun::CWeightedDegreeStringKernel(degree, type);
01026
01027
01028 std::cout << "done loading WDK from non-defaultconstruct data" << std::endl;
01029 }
01030 }
01031 }
01032 #endif //HAVE_BOOST_SERIALIZATION
01033
01034
01035
01036
01037 #endif