WeightedDegreeStringKernel.h

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2009 Soeren Sonnenburg
00008  * Written (W) 1999-2008 Gunnar Raetsch
00009  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
00010  */
00011 
00012 #ifndef _WEIGHTEDDEGREESTRINGKERNEL_H___
00013 #define _WEIGHTEDDEGREESTRINGKERNEL_H___
00014 
00015 #include "lib/common.h"
00016 #include "lib/Trie.h"
00017 #include "kernel/StringKernel.h"
00018 #include "kernel/MultitaskKernelMklNormalizer.h"
00019 #include "features/StringFeatures.h"
00020 
00021 
00022 
00023 
00024 
00025 
00026 namespace shogun
00027 {
00028 
/** Kernel type tag; stored in CWeightedDegreeStringKernel::type and passed
 *  to its (degree, type) constructor. The numeric values are fixed
 *  explicitly because the tag is written to archives as a number. */
enum EWDKernType
{
    E_WD              = 0,
    E_EXTERNAL        = 1,

    /* block variants */
    E_BLOCK_CONST     = 2,
    E_BLOCK_LINEAR    = 3,
    E_BLOCK_SQPOLY    = 4,
    E_BLOCK_CUBICPOLY = 5,
    E_BLOCK_EXP       = 6,
    E_BLOCK_LOG       = 7,
    E_BLOCK_EXTERNAL  = 8
};
00042 
00043 
00058 class CWeightedDegreeStringKernel: public CStringKernel<char>
00059 {
00060     public:
00061 
00067         CWeightedDegreeStringKernel(int32_t degree, EWDKernType type=E_WD);
00068 
00074         CWeightedDegreeStringKernel(float64_t* weights, int32_t degree);
00075 
00082         CWeightedDegreeStringKernel(
00083             CStringFeatures<char>* l, CStringFeatures<char>* r, int32_t degree);
00084 
00085         virtual ~CWeightedDegreeStringKernel();
00086 
00093         virtual bool init(CFeatures* l, CFeatures* r);
00094 
00096         virtual void cleanup();
00097 
00105         EWDKernType get_type() const
00106         {
00107             return type;
00108         }
00109 
00114         int32_t get_degree() const
00115         {
00116             return degree;
00117         }
00118 
00124         int32_t get_max_mismatch() const
00125         {
00126             return max_mismatch;
00127         }
00128 
00133         virtual EKernelType get_kernel_type() { return K_WEIGHTEDDEGREE; }
00134 
00139         virtual const char* get_name() const { return "WeightedDegree"; }
00140 
00148         inline virtual bool init_optimization(
00149             int32_t count, int32_t *IDX, float64_t* alphas)
00150         {
00151             return init_optimization(count, IDX, alphas, -1);
00152         }
00153 
00164         virtual bool init_optimization(
00165             int32_t count, int32_t *IDX, float64_t* alphas, int32_t tree_num);
00166 
00171         virtual bool delete_optimization();
00172 
00178         virtual float64_t compute_optimized(int32_t idx)
00179         {
00180             if (get_is_initialized())
00181                 return compute_by_tree(idx);
00182 
00183             SG_ERROR( "CWeightedDegreeStringKernel optimization not initialized\n");
00184             return 0;
00185         }
00186 
00191         static void* compute_batch_helper(void* p);
00192 
00203         virtual void compute_batch(
00204             int32_t num_vec, int32_t* vec_idx, float64_t* target,
00205             int32_t num_suppvec, int32_t* IDX, float64_t* alphas,
00206             float64_t factor=1.0);
00207 
00211         inline virtual void clear_normal()
00212         {
00213             if (get_is_initialized())
00214             {
00215 
00216                 if (normalizer && normalizer->get_normalizer_type()==N_MULTITASK)
00217                     SG_ERROR("not implemented");
00218 
00219                 tries->delete_trees(max_mismatch==0);
00220                 set_is_initialized(false);
00221             }
00222         }
00223 
00229         inline virtual void add_to_normal(int32_t idx, float64_t weight)
00230         {
00231 
00232             if (normalizer && normalizer->get_normalizer_type()==N_MULTITASK)
00233                 SG_ERROR("not implemented");
00234 
00235             if (max_mismatch==0)
00236                 add_example_to_tree(idx, weight);
00237             else
00238                 add_example_to_tree_mismatch(idx, weight);
00239 
00240             set_is_initialized(true);
00241         }
00242 
00247         inline virtual int32_t get_num_subkernels()
00248         {
00249             if (normalizer && normalizer->get_normalizer_type()==N_MULTITASK)
00250                 return ((CMultitaskKernelMklNormalizer*)normalizer)->get_num_betas();
00251             if (position_weights!=NULL)
00252                 return (int32_t) ceil(1.0*seq_length/mkl_stepsize) ;
00253             if (length==0)
00254                 return (int32_t) ceil(1.0*get_degree()/mkl_stepsize);
00255             return (int32_t) ceil(1.0*get_degree()*length/mkl_stepsize) ;
00256         }
00257 
00263         inline void compute_by_subkernel(
00264             int32_t idx, float64_t * subkernel_contrib)
00265         {
00266 
00267             if (get_is_initialized())
00268             {
00269 
00270                 if (normalizer && normalizer->get_normalizer_type()==N_MULTITASK)
00271                     SG_ERROR("not implemented");
00272 
00273                 compute_by_tree(idx, subkernel_contrib);
00274                 return ;
00275             }
00276 
00277             SG_ERROR( "CWeightedDegreeStringKernel optimization not initialized\n");
00278         }
00279 
00285         inline const float64_t* get_subkernel_weights(int32_t& num_weights)
00286         {
00287 
00288             num_weights = get_num_subkernels();
00289 
00290             delete[] weights_buffer ;
00291             weights_buffer = new float64_t[num_weights];
00292 
00293             if (normalizer && normalizer->get_normalizer_type()==N_MULTITASK)
00294                 for (int32_t i=0; i<num_weights; i++)
00295                     weights_buffer[i] = ((CMultitaskKernelMklNormalizer*)normalizer)->get_beta(i);
00296             else if (position_weights!=NULL)
00297                 for (int32_t i=0; i<num_weights; i++)
00298                     weights_buffer[i] = position_weights[i*mkl_stepsize];
00299             else
00300                 for (int32_t i=0; i<num_weights; i++)
00301                     weights_buffer[i] = weights[i*mkl_stepsize];
00302 
00303             return weights_buffer;
00304         }
00305 
00311         inline void set_subkernel_weights(
00312             float64_t* weights2, int32_t num_weights2)
00313         {
00314             int32_t num_weights = get_num_subkernels();
00315             if (num_weights!=num_weights2)
00316                 SG_ERROR( "number of weights do not match\n");
00317 
00318 
00319             if (normalizer && normalizer->get_normalizer_type()==N_MULTITASK)
00320                 for (int32_t i=0; i<num_weights; i++)
00321                     ((CMultitaskKernelMklNormalizer*)normalizer)->set_beta(i, weights2[i]);
00322             else if (position_weights!=NULL)
00323             {
00324                 for (int32_t i=0; i<num_weights; i++)
00325                 {
00326                     for (int32_t j=0; j<mkl_stepsize; j++)
00327                     {
00328                         if (i*mkl_stepsize+j<seq_length)
00329                             position_weights[i*mkl_stepsize+j] = weights2[i];
00330                     }
00331                 }
00332             }
00333             else if (length==0)
00334             {
00335                 for (int32_t i=0; i<num_weights; i++)
00336                 {
00337                     for (int32_t j=0; j<mkl_stepsize; j++)
00338                     {
00339                         if (i*mkl_stepsize+j<get_degree())
00340                             weights[i*mkl_stepsize+j] = weights2[i];
00341                     }
00342                 }
00343             }
00344             else
00345             {
00346                 for (int32_t i=0; i<num_weights; i++)
00347                 {
00348                     for (int32_t j=0; j<mkl_stepsize; j++)
00349                     {
00350                         if (i*mkl_stepsize+j<get_degree()*length)
00351                             weights[i*mkl_stepsize+j] = weights2[i];
00352                     }
00353                 }
00354             }
00355         }
00356 
00361         virtual bool set_normalizer(CKernelNormalizer* normalizer_) {
00362 
00363             if (normalizer_ && strcmp(normalizer_->get_name(),"MultitaskKernelTreeNormalizer")==0) {
00364                 unset_property(KP_LINADD);
00365                 unset_property(KP_BATCHEVALUATION);
00366             }
00367             else
00368             {
00369                 set_property(KP_LINADD);
00370                 set_property(KP_BATCHEVALUATION);
00371             }
00372 
00373 
00374             return CStringKernel<char>::set_normalizer(normalizer_);
00375 
00376         }
00377 
00378         // other kernel tree operations
00384         float64_t *compute_abs_weights(int32_t & len);
00385 
00392         void compute_by_tree(int32_t idx, float64_t *LevelContrib);
00393 
00398         bool is_tree_initialized() { return tree_initialized; }
00399 
00405         inline float64_t *get_degree_weights(int32_t& d, int32_t& len)
00406         {
00407             d=degree;
00408             len=length;
00409             return weights;
00410         }
00411 
00417         inline float64_t *get_weights(int32_t& num_weights)
00418         {
00419 
00420             if (normalizer && normalizer->get_normalizer_type()==N_MULTITASK)
00421                 SG_ERROR("not implemented");
00422 
00423             if (position_weights!=NULL)
00424             {
00425                 num_weights = seq_length ;
00426                 return position_weights ;
00427             }
00428             if (length==0)
00429                 num_weights = degree ;
00430             else
00431                 num_weights = degree*length ;
00432             return weights;
00433         }
00434 
00440         inline float64_t *get_position_weights(int32_t& len)
00441         {
00442             len=seq_length;
00443             return position_weights;
00444         }
00445 
00451         bool set_wd_weights_by_type(EWDKernType type);
00452 
00459         void set_wd_weights(float64_t* p_weights, int32_t d)
00460         {
00461             set_weights(p_weights,d,0);
00462         }
00463 
00470         bool set_weights(float64_t* weights, int32_t d, int32_t len);
00471 
00478         bool set_position_weights(float64_t* position_weights, int32_t len=0);
00479 
00484         bool init_block_weights();
00485 
00490         bool init_block_weights_from_wd();
00491 
00496         bool init_block_weights_from_wd_external();
00497 
00502         bool init_block_weights_const();
00503 
00508         bool init_block_weights_linear();
00509 
00514         bool init_block_weights_sqpoly();
00515 
00520         bool init_block_weights_cubicpoly();
00521 
00526         bool init_block_weights_exp();
00527 
00532         bool init_block_weights_log();
00533 
00538         bool init_block_weights_external();
00539 
00544         bool delete_position_weights()
00545         {
00546             delete[] position_weights;
00547             position_weights=NULL;
00548             return true;
00549         }
00550 
00556         bool set_max_mismatch(int32_t max);
00557 
00562         inline int32_t get_max_mismatch() { return max_mismatch; }
00563 
00569         inline bool set_degree(int32_t deg) { degree=deg; return true; }
00570 
00575         inline int32_t get_degree() { return degree; }
00576 
00582         inline bool set_use_block_computation(bool block)
00583         {
00584             block_computation=block;
00585             return true;
00586         }
00587 
00592         inline bool get_use_block_computation() { return block_computation; }
00593 
00599         inline bool set_mkl_stepsize(int32_t step)
00600         {
00601             if (step<1)
00602                 SG_ERROR("Stepsize must be a positive integer\n");
00603             mkl_stepsize=step;
00604             return true;
00605         }
00606 
00611         inline int32_t get_mkl_stepsize() { return mkl_stepsize; }
00612 
00618         inline bool set_which_degree(int32_t which)
00619         {
00620             which_degree=which;
00621             return true;
00622         }
00623 
00628         inline int32_t get_which_degree() { return which_degree; }
00629 
00630     protected:
00632         void create_empty_tries();
00633 
00639         void add_example_to_tree(int32_t idx, float64_t weight);
00640 
00647         void add_example_to_single_tree(
00648             int32_t idx, float64_t weight, int32_t tree_num);
00649 
00655         void add_example_to_tree_mismatch(int32_t idx, float64_t weight);
00656 
00663         void add_example_to_single_tree_mismatch(
00664             int32_t idx, float64_t weight, int32_t tree_num);
00665 
00671         float64_t compute_by_tree(int32_t idx);
00672 
00681         float64_t compute(int32_t idx_a, int32_t idx_b);
00682 
00691         float64_t compute_with_mismatch(
00692             char* avec, int32_t alen, char* bvec, int32_t blen);
00693 
00702         float64_t compute_without_mismatch(
00703             char* avec, int32_t alen, char* bvec, int32_t blen);
00704 
00713         float64_t compute_without_mismatch_matrix(
00714             char* avec, int32_t alen, char* bvec, int32_t blen);
00715 
00724         float64_t compute_using_block(char* avec, int32_t alen,
00725             char* bvec, int32_t blen);
00726 
00728         virtual void remove_lhs();
00729 
00730 
00731 #ifdef HAVE_BOOST_SERIALIZATION
00732 
00733 
00734 
00735 
00736     private:
00737 
00738         // serialization needs to split up in save/load because
00739         // the serialization of pointers to natives (int* & friends)
00740         // requires a workaround
00741         friend class ::boost::serialization::access;
00742 
00743         //  friend std::ostream & operator<<(std::ostream &os, const CWeightedDegreeStringKernel &gp);
00744         //template<class Archive>
00745         //friend void ::boost::serialization::save_construct_data(Archive & ar, const CWeightedDegreeStringKernel* t, const unsigned int file_version);
00746         template<class Archive>
00747             void save(Archive & ar, const unsigned int archive_version) const
00748             {
00749 
00750                 SG_DEBUG("archiving CWeightedDegreeStringKernel\n");
00751 
00752                 ar & ::boost::serialization::base_object<CStringKernel<char> >(*this);
00753 
00754 
00757                 ar & mkl_stepsize ;
00758                 //ar & degree;
00759                 ar & length;
00760                 ar & max_mismatch ;
00761 
00762                 //for (int32_t i=0; i<degree*(1+max_mismatch); i++)
00763                 //    ar & weights[i];
00764 
00765                 //TODO how long?
00766                 //float64_t* position_weights ;
00767                 //float64_t* weights_buffer ;
00768 
00769                 ar & seq_length ;
00770 
00771                 ar & initialized ;
00772                 ar & block_computation;
00773                 //ar & use_normalization ;
00774 
00775                 //ar & normalization_const;
00776 
00777                 ar & num_block_weights_external;
00778                 for (int32_t i=0; i < num_block_weights_external; ++i)
00779                 {
00780                     ar & block_weights_external[i];
00781                 }
00782 
00783                 //TODO how long
00784                 //float64_t* block_weights;
00785                 //ar & type;
00786                 ar & which_degree;
00787 
00788                 //TODO implement
00789                 //CTrie<DNATrie> tries ;
00790                 //ar & tree_initialized ;
00791 
00792 
00793                 //CWeightedDegreeStringKernel* tmp = const_cast<CWeightedDegreeStringKernel*>(this);
00794                 //tmp->create_empty_tries();
00795                 //create_empty_tries();
00796 
00797                 SG_DEBUG("done with CWeightedDegreeStringKernel\n");
00798 
00799             }
00800 
00801         template<class Archive>
00802             void load(Archive & ar, const unsigned int archive_version)
00803             {
00804                 SG_DEBUG("archiving CWeightedDegreeStringKernel\n");
00805 
00806                 ar & ::boost::serialization::base_object<CStringKernel<char> >(*this);
00807 
00808 
00811                 ar & mkl_stepsize ;
00812                 ar & degree;
00813                 ar & length;
00814                 ar & max_mismatch ;
00815 
00816                 //weights=new float64_t[degree*(1+max_mismatch)];
00817                 //for (int32_t i=0; i<degree*(1+max_mismatch); i++)
00818                 //    ar & weights[i];
00819 
00820 
00821                 //TODO how long?
00822                 //float64_t* position_weights ;
00823                 //float64_t* weights_buffer ;
00824 
00825                 ar & seq_length ;
00826 
00827                 ar & initialized ;
00828                 ar & block_computation;
00829                 //ar & use_normalization ;
00830 
00831                 //ar & normalization_const;
00832 
00833                 ar & num_block_weights_external;
00834                 //float64_t* block_weights_external;
00835                 block_weights_external = new float64_t[num_block_weights_external];
00836                 for (int32_t i=0; i < num_block_weights_external; ++i)
00837                 {
00838                     ar & block_weights_external[i];
00839                 }
00840 
00841                 //TODO how long
00842                 //float64_t* block_weights;
00843                 //ar & type;
00844                 ar & which_degree;
00845 
00846                 //TODO implement
00847                 //CTrie<DNATrie> tries ;
00848                 //ar & tree_initialized ;
00849 
00850                 SG_DEBUG("done with CWeightedDegreeStringKernel\n");
00851 
00852             }
00853 
00854         GLOBAL_BOOST_SERIALIZATION_SPLIT_MEMBER();
00855 
00856 
00857     public:
00858 
00859         virtual std::string toString() const
00860         {
00861             std::ostringstream s;
00862 
00863             ::boost::archive::text_oarchive oa(s);
00864 
00865             oa << *this;
00866 
00867             return s.str();
00868         }
00869 
00870         virtual void fromString(std::string str)
00871         {
00872 
00873             std::istringstream is(str);
00874 
00875             ::boost::archive::text_iarchive ia(is);
00876 
00877             ia >> *this;
00878 
00879         }
00880 
00881 #endif //HAVE_BOOST_SERIALIZATION
00882 
00883 
00884     protected:
00888         float64_t* weights;
00890         float64_t* position_weights;
00892         float64_t* weights_buffer;
00894         int32_t mkl_stepsize;
00896         int32_t degree;
00898         int32_t length;
00899 
00901         int32_t max_mismatch;
00903         int32_t seq_length;
00904 
00906         bool initialized;
00907 
00909         bool block_computation;
00910 
00912         int32_t num_block_weights_external;
00914         float64_t* block_weights_external;
00915 
00917         float64_t* block_weights;
00919         EWDKernType type;
00921         int32_t which_degree;
00922 
00924         CTrie<DNATrie>* tries;
00925 
00927         bool tree_initialized;
00928 
00930         CAlphabet* alphabet;
00931 };
00932 
00933 }
00934 
00935 
00936 
00937 #ifdef HAVE_BOOST_SERIALIZATION
00938 #include <boost/serialization/export.hpp>
00939 
00940 #endif //HAVE_BOOST_SERIALIZATION
00941 
00942 
00943 #ifdef HAVE_BOOST_SERIALIZATION
00944 
00945 namespace boost
00946 {
00947     namespace serialization
00948     {
00949         template<class Archive>
00950             //inline void save_construct_data(Archive & ar, const shogun::CWeightedDegreeStringKernel* const t, const unsigned int file_version)
00951             inline void save_construct_data(Archive & ar, shogun::CWeightedDegreeStringKernel* t, const unsigned int file_version)
00952             {
00953 
00954                 std::cout << "saving WDK from non-defaultconstruct data works" << std::endl;
00955 
00956                 //CWeightedDegreeStringKernel(INT size, EWDKernType type, INT degree, INT max_mismatch, bool use_normalization=true, bool block_computation=false, INT mkl_stepsize=1, INT which_deg=-1);
00957 
00958                 //ar << t->cache_size;
00959 
00960                 ar << t->type;
00961 
00962                 ar << t->degree;
00963 
00964                 //ar << t->max_mismatch;
00965                 /*
00966 
00967                 ar.register_type(static_cast<shogun::CStringFeatures<char> *>(NULL));
00968 
00969 
00970                 const shogun::CStringFeatures<char>* const lhs = dynamic_cast<shogun::CStringFeatures<char>* >(const_cast<shogun::CWeightedDegreeStringKernel*>(t)->get_lhs());
00971 
00972                 const shogun::CStringFeatures<char>* const rhs = dynamic_cast<shogun::CStringFeatures<char>* >(const_cast<shogun::CWeightedDegreeStringKernel*>(t)->get_rhs());
00973 
00974                 //CStringFeatures<char>* lhs = (CStringFeatures<char>*) (const_cast<CWeightedDegreeStringKernel*>(t)->get_lhs());
00975                 //CStringFeatures<char>* rhs = (CStringFeatures<char>*) (const_cast<CWeightedDegreeStringKernel*>(t)->get_rhs());
00976 
00977                 //    const CFeatures* const lhs = t->get_lhs();
00978                 //    const CFeatures* const rhs = t->get_rhs();
00979 
00980                 ar << lhs;
00981                 ar << rhs;
00982 
00983                 //ar << dynamic_cast<CStringFeatures<char>*>(rhs);
00984                 //ar << t->get_lhs();
00985                 //ar << t->get_rhs();
00986                 */
00987                 std::cout << "done saving WDK from non-defaultconstruct data" << std::endl;
00988 
00989             }
00990 
00991         template<class Archive>
00992             inline void load_construct_data(Archive & ar, shogun::CWeightedDegreeStringKernel * t, const unsigned int file_version)
00993             {
00994 
00995                 std::cout << "loading WDK from non-defaultconstruct data" << std::endl;
00996 
00997 
00998 
00999                 shogun::EWDKernType type;
01000                 int32_t degree;
01001 
01002                 ar >> type;
01003                 ar >> degree;
01004                 /*
01005                 int32_t size;
01006                 int32_t max_mismatch;
01007 
01008                 ar >> size;
01009                 ar >> type;
01010                 ar >> degree;
01011                 ar >> max_mismatch;
01012 
01013                 //      ::new(t)CWeightedDegreeStringKernel(size, type, degree, max_mismatch);
01014 
01015                 shogun::CStringFeatures<char>* lhs;
01016                 shogun::CStringFeatures<char>* rhs;
01017 
01018 
01019                 ar >> lhs;
01020                 ar >> rhs;
01021 
01022                 ::new(t)shogun::CWeightedDegreeStringKernel(lhs, rhs, degree);
01023                 */
01024 
01025                 ::new(t)shogun::CWeightedDegreeStringKernel(degree, type);
01026                 //t->set_max_mismatch(max_mismatch);
01027 
01028                 std::cout << "done loading WDK from non-defaultconstruct data" << std::endl;
01029             }
01030     } // serialization
01031 } // namespace boost
01032 #endif //HAVE_BOOST_SERIALIZATION
01033 
01034 //BOOST_CLASS_EXPORT_KEY2(shogun::CWeightedDegreeStringKernel, "CWeightedDegreeStringKernel");
01035 
01036 
01037 #endif /* _WEIGHTEDDEGREESTRINGKERNEL_H__ */
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation