WeightedCommWordStringKernel.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2009 Soeren Sonnenburg
00008  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
00009  */
00010 
00011 #include "lib/common.h"
00012 #include "kernel/WeightedCommWordStringKernel.h"
00013 #include "features/StringFeatures.h"
00014 #include "lib/io.h"
00015 
00016 using namespace shogun;
00017 
00018 CWeightedCommWordStringKernel::CWeightedCommWordStringKernel(
00019     int32_t size, bool us)
00020 : CCommWordStringKernel(size, us), degree(0), weights(NULL)
00021 {
00022     init_dictionary(1<<(sizeof(uint16_t)*9));
00023     ASSERT(us==false);
00024 }
00025 
00026 CWeightedCommWordStringKernel::CWeightedCommWordStringKernel(
00027     CStringFeatures<uint16_t>* l, CStringFeatures<uint16_t>* r, bool us,
00028     int32_t size)
00029 : CCommWordStringKernel(size, us), degree(0), weights(NULL)
00030 {
00031     init_dictionary(1<<(sizeof(uint16_t)*9));
00032     ASSERT(us==false);
00033 
00034     init(l,r);
00035 }
00036 
00037 CWeightedCommWordStringKernel::~CWeightedCommWordStringKernel()
00038 {
00039     delete[] weights;
00040 }
00041 
00042 bool CWeightedCommWordStringKernel::init(CFeatures* l, CFeatures* r)
00043 {
00044     ASSERT(((CStringFeatures<uint16_t>*) l)->get_order() ==
00045             ((CStringFeatures<uint16_t>*) r)->get_order());
00046     degree=((CStringFeatures<uint16_t>*) l)->get_order();
00047     set_wd_weights();
00048 
00049     CCommWordStringKernel::init(l,r);
00050     return init_normalizer();
00051 }
00052 
00053 void CWeightedCommWordStringKernel::cleanup()
00054 {
00055     delete[] weights;
00056     weights=NULL;
00057 
00058     CCommWordStringKernel::cleanup();
00059 }
00060 
00061 bool CWeightedCommWordStringKernel::set_wd_weights()
00062 {
00063     delete[] weights;
00064     weights=new float64_t[degree];
00065 
00066     int32_t i;
00067     float64_t sum=0;
00068     for (i=0; i<degree; i++)
00069     {
00070         weights[i]=degree-i;
00071         sum+=weights[i];
00072     }
00073     for (i=0; i<degree; i++)
00074         weights[i]=CMath::sqrt(weights[i]/sum);
00075 
00076     return weights!=NULL;
00077 }
00078 
00079 bool CWeightedCommWordStringKernel::set_weights(float64_t* w, int32_t d)
00080 {
00081     ASSERT(d==degree);
00082 
00083     delete[] weights;
00084     weights=new float64_t[degree];
00085     for (int32_t i=0; i<degree; i++)
00086         weights[i]=CMath::sqrt(w[i]);
00087     return true;
00088 }
00089   
00090 float64_t CWeightedCommWordStringKernel::compute_helper(
00091     int32_t idx_a, int32_t idx_b, bool do_sort)
00092 {
00093     int32_t alen, blen;
00094     bool free_avec, free_bvec;
00095 
00096     CStringFeatures<uint16_t>* l = (CStringFeatures<uint16_t>*) lhs;
00097     CStringFeatures<uint16_t>* r = (CStringFeatures<uint16_t>*) rhs;
00098 
00099     uint16_t* av=l->get_feature_vector(idx_a, alen, free_avec);
00100     uint16_t* bv=r->get_feature_vector(idx_b, blen, free_bvec);
00101 
00102     uint16_t* avec=av;
00103     uint16_t* bvec=bv;
00104 
00105     if (do_sort)
00106     {
00107         if (alen>0)
00108         {
00109             avec=new uint16_t[alen];
00110             memcpy(avec, av, sizeof(uint16_t)*alen);
00111             CMath::radix_sort(avec, alen);
00112         }
00113         else
00114             avec=NULL;
00115 
00116         if (blen>0)
00117         {
00118             bvec=new uint16_t[blen];
00119             memcpy(bvec, bv, sizeof(uint16_t)*blen);
00120             CMath::radix_sort(bvec, blen);
00121         }
00122         else
00123             bvec=NULL;
00124     }
00125     else
00126     {
00127         if ( (l->get_num_preproc() != l->get_num_preprocessed()) ||
00128                 (r->get_num_preproc() != r->get_num_preprocessed()))
00129         {
00130             SG_ERROR("not all preprocessors have been applied to training (%d/%d)"
00131                     " or test (%d/%d) data\n", l->get_num_preprocessed(), l->get_num_preproc(),
00132                     r->get_num_preprocessed(), r->get_num_preproc());
00133         }
00134     }
00135 
00136     float64_t result=0;
00137     uint8_t mask=0;
00138 
00139     for (int32_t d=0; d<degree; d++)
00140     {
00141         mask = mask | (1 << (degree-d-1));
00142         uint16_t masked=((CStringFeatures<uint16_t>*) lhs)->get_masked_symbols(0xffff, mask);
00143 
00144         int32_t left_idx=0;
00145         int32_t right_idx=0;
00146         float64_t weight=weights[d]*weights[d];
00147 
00148         while (left_idx < alen && right_idx < blen)
00149         {
00150             uint16_t lsym=avec[left_idx] & masked;
00151             uint16_t rsym=bvec[right_idx] & masked;
00152 
00153             if (lsym == rsym)
00154             {
00155                 int32_t old_left_idx=left_idx;
00156                 int32_t old_right_idx=right_idx;
00157 
00158                 while (left_idx<alen && (avec[left_idx] & masked) ==lsym)
00159                     left_idx++;
00160 
00161                 while (right_idx<blen && (bvec[right_idx] & masked) ==lsym)
00162                     right_idx++;
00163 
00164                 result+=weight*(left_idx-old_left_idx)*(right_idx-old_right_idx);
00165             }
00166             else if (lsym<rsym)
00167                 left_idx++;
00168             else
00169                 right_idx++;
00170         }
00171     }
00172 
00173     if (do_sort)
00174     {
00175         delete[] avec;
00176         delete[] bvec;
00177     }
00178 
00179     l->free_feature_vector(av, idx_a, free_avec);
00180     r->free_feature_vector(bv, idx_b, free_bvec);
00181 
00182     return result;
00183 }
00184 
00185 void CWeightedCommWordStringKernel::add_to_normal(
00186     int32_t vec_idx, float64_t weight)
00187 {
00188     int32_t len=-1;
00189     bool free_vec;
00190     CStringFeatures<uint16_t>* s=(CStringFeatures<uint16_t>*) lhs;
00191     uint16_t* vec=s->get_feature_vector(vec_idx, len, free_vec);
00192 
00193     if (len>0)
00194     {
00195         for (int32_t j=0; j<len; j++)
00196         {
00197             uint8_t mask=0;
00198             int32_t offs=0;
00199             for (int32_t d=0; d<degree; d++)
00200             {
00201                 mask = mask | (1 << (degree-d-1));
00202                 int32_t idx=s->get_masked_symbols(vec[j], mask);
00203                 idx=s->shift_symbol(idx, degree-d-1);
00204                 dictionary_weights[offs + idx] += normalizer->normalize_lhs(weight*weights[d], vec_idx);
00205                 offs+=s->shift_offset(1,d+1);
00206             }
00207         }
00208 
00209         set_is_initialized(true);
00210     }
00211 
00212     s->free_feature_vector(vec, vec_idx, free_vec);
00213 }
00214 
00215 void CWeightedCommWordStringKernel::merge_normal()
00216 {
00217     ASSERT(get_is_initialized());
00218     ASSERT(use_sign==false);
00219 
00220     CStringFeatures<uint16_t>* s=(CStringFeatures<uint16_t>*) rhs;
00221     uint32_t num_symbols=(uint32_t) s->get_num_symbols();
00222     int32_t dic_size=1<<(sizeof(uint16_t)*8);
00223     float64_t* dic=new float64_t[dic_size];
00224     memset(dic, 0, sizeof(float64_t)*dic_size);
00225 
00226     for (uint32_t sym=0; sym<num_symbols; sym++)
00227     {
00228         float64_t result=0;
00229         uint8_t mask=0;
00230         int32_t offs=0;
00231         for (int32_t d=0; d<degree; d++)
00232         {
00233             mask = mask | (1 << (degree-d-1));
00234             int32_t idx=s->get_masked_symbols(sym, mask);
00235             idx=s->shift_symbol(idx, degree-d-1);
00236             result += dictionary_weights[offs + idx];
00237             offs+=s->shift_offset(1,d+1);
00238         }
00239         dic[sym]=result;
00240     }
00241 
00242     init_dictionary(1<<(sizeof(uint16_t)*8));
00243     memcpy(dictionary_weights, dic, sizeof(float64_t)*dic_size);
00244     delete[] dic;
00245 }
00246 
00247 float64_t CWeightedCommWordStringKernel::compute_optimized(int32_t i)
00248 { 
00249     if (!get_is_initialized())
00250         SG_ERROR( "CCommWordStringKernel optimization not initialized\n");
00251 
00252     ASSERT(use_sign==false);
00253 
00254     float64_t result=0;
00255     bool free_vec;
00256     int32_t len=-1;
00257     CStringFeatures<uint16_t>* s=(CStringFeatures<uint16_t>*) rhs;
00258     uint16_t* vec=s->get_feature_vector(i, len, free_vec);
00259 
00260     if (vec && len>0)
00261     {
00262         for (int32_t j=0; j<len; j++)
00263         {
00264             uint8_t mask=0;
00265             int32_t offs=0;
00266             for (int32_t d=0; d<degree; d++)
00267             {
00268                 mask = mask | (1 << (degree-d-1));
00269                 int32_t idx=s->get_masked_symbols(vec[j], mask);
00270                 idx=s->shift_symbol(idx, degree-d-1);
00271                 result += dictionary_weights[offs + idx]*weights[d];
00272                 offs+=s->shift_offset(1,d+1);
00273             }
00274         }
00275 
00276         result=normalizer->normalize_rhs(result, i);
00277     }
00278     s->free_feature_vector(vec, i, free_vec);
00279     return result;
00280 }
00281 
00282 float64_t* CWeightedCommWordStringKernel::compute_scoring(
00283     int32_t max_degree, int32_t& num_feat, int32_t& num_sym, float64_t* target,
00284     int32_t num_suppvec, int32_t* IDX, float64_t* alphas, bool do_init)
00285 {
00286     if (do_init)
00287         CCommWordStringKernel::init_optimization(num_suppvec, IDX, alphas);
00288 
00289     int32_t dic_size=1<<(sizeof(uint16_t)*9);
00290     float64_t* dic=new float64_t[dic_size];
00291     memcpy(dic, dictionary_weights, sizeof(float64_t)*dic_size);
00292 
00293     merge_normal();
00294     float64_t* result=CCommWordStringKernel::compute_scoring(max_degree, num_feat,
00295             num_sym, target, num_suppvec, IDX, alphas, false);
00296 
00297     init_dictionary(1<<(sizeof(uint16_t)*9));
00298     memcpy(dictionary_weights,dic,  sizeof(float64_t)*dic_size);
00299     delete[] dic;
00300 
00301     return result;
00302 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation