WeightedCommWordStringKernel.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2009 Soeren Sonnenburg
00008  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
00009  */
00010 
00011 #include "lib/common.h"
00012 #include "kernel/WeightedCommWordStringKernel.h"
00013 #include "features/StringFeatures.h"
00014 #include "lib/io.h"
00015 
00016 CWeightedCommWordStringKernel::CWeightedCommWordStringKernel(
00017     int32_t size, bool us)
00018 : CCommWordStringKernel(size, us), degree(0), weights(NULL)
00019 {
00020     init_dictionary(1<<(sizeof(uint16_t)*9));
00021     ASSERT(us==false);
00022 }
00023 
00024 CWeightedCommWordStringKernel::CWeightedCommWordStringKernel(
00025     CStringFeatures<uint16_t>* l, CStringFeatures<uint16_t>* r, bool us,
00026     int32_t size)
00027 : CCommWordStringKernel(size, us), degree(0), weights(NULL)
00028 {
00029     init_dictionary(1<<(sizeof(uint16_t)*9));
00030     ASSERT(us==false);
00031 
00032     init(l,r);
00033 }
00034 
00035 CWeightedCommWordStringKernel::~CWeightedCommWordStringKernel()
00036 {
00037     delete[] weights;
00038 }
00039 
00040 bool CWeightedCommWordStringKernel::init(CFeatures* l, CFeatures* r)
00041 {
00042     ASSERT(((CStringFeatures<uint16_t>*) l)->get_order() ==
00043             ((CStringFeatures<uint16_t>*) r)->get_order());
00044     degree=((CStringFeatures<uint16_t>*) l)->get_order();
00045     set_wd_weights();
00046 
00047     CCommWordStringKernel::init(l,r);
00048     return init_normalizer();
00049 }
00050 
00051 void CWeightedCommWordStringKernel::cleanup()
00052 {
00053     delete[] weights;
00054     weights=NULL;
00055 
00056     CCommWordStringKernel::cleanup();
00057 }
00058 
00059 bool CWeightedCommWordStringKernel::set_wd_weights()
00060 {
00061     delete[] weights;
00062     weights=new float64_t[degree];
00063 
00064     int32_t i;
00065     float64_t sum=0;
00066     for (i=0; i<degree; i++)
00067     {
00068         weights[i]=degree-i;
00069         sum+=weights[i];
00070     }
00071     for (i=0; i<degree; i++)
00072         weights[i]=CMath::sqrt(weights[i]/sum);
00073 
00074     return weights!=NULL;
00075 }
00076 
00077 bool CWeightedCommWordStringKernel::set_weights(float64_t* w, int32_t d)
00078 {
00079     ASSERT(d==degree);
00080 
00081     delete[] weights;
00082     weights=new float64_t[degree];
00083     for (int32_t i=0; i<degree; i++)
00084         weights[i]=CMath::sqrt(w[i]);
00085     return true;
00086 }
00087   
00088 float64_t CWeightedCommWordStringKernel::compute_helper(
00089     int32_t idx_a, int32_t idx_b, bool do_sort)
00090 {
00091     int32_t alen, blen;
00092 
00093     CStringFeatures<uint16_t>* l = (CStringFeatures<uint16_t>*) lhs;
00094     CStringFeatures<uint16_t>* r = (CStringFeatures<uint16_t>*) rhs;
00095 
00096     uint16_t* av=l->get_feature_vector(idx_a, alen);
00097     uint16_t* bv=r->get_feature_vector(idx_b, blen);
00098 
00099     uint16_t* avec=av;
00100     uint16_t* bvec=bv;
00101 
00102     if (do_sort)
00103     {
00104         if (alen>0)
00105         {
00106             avec=new uint16_t[alen];
00107             memcpy(avec, av, sizeof(uint16_t)*alen);
00108             CMath::radix_sort(avec, alen);
00109         }
00110         else
00111             avec=NULL;
00112 
00113         if (blen>0)
00114         {
00115             bvec=new uint16_t[blen];
00116             memcpy(bvec, bv, sizeof(uint16_t)*blen);
00117             CMath::radix_sort(bvec, blen);
00118         }
00119         else
00120             bvec=NULL;
00121     }
00122     else
00123     {
00124         if ( (l->get_num_preproc() != l->get_num_preprocessed()) ||
00125                 (r->get_num_preproc() != r->get_num_preprocessed()))
00126         {
00127             SG_ERROR("not all preprocessors have been applied to training (%d/%d)"
00128                     " or test (%d/%d) data\n", l->get_num_preprocessed(), l->get_num_preproc(),
00129                     r->get_num_preprocessed(), r->get_num_preproc());
00130         }
00131     }
00132 
00133     float64_t result=0;
00134     uint8_t mask=0;
00135 
00136     for (int32_t d=0; d<degree; d++)
00137     {
00138         mask = mask | (1 << (degree-d-1));
00139         uint16_t masked=((CStringFeatures<uint16_t>*) lhs)->get_masked_symbols(0xffff, mask);
00140 
00141         int32_t left_idx=0;
00142         int32_t right_idx=0;
00143         float64_t weight=weights[d]*weights[d];
00144 
00145         while (left_idx < alen && right_idx < blen)
00146         {
00147             uint16_t lsym=avec[left_idx] & masked;
00148             uint16_t rsym=bvec[right_idx] & masked;
00149 
00150             if (lsym == rsym)
00151             {
00152                 int32_t old_left_idx=left_idx;
00153                 int32_t old_right_idx=right_idx;
00154 
00155                 while (left_idx<alen && (avec[left_idx] & masked) ==lsym)
00156                     left_idx++;
00157 
00158                 while (right_idx<blen && (bvec[right_idx] & masked) ==lsym)
00159                     right_idx++;
00160 
00161                 result+=weight*(left_idx-old_left_idx)*(right_idx-old_right_idx);
00162             }
00163             else if (lsym<rsym)
00164                 left_idx++;
00165             else
00166                 right_idx++;
00167         }
00168     }
00169 
00170     if (do_sort)
00171     {
00172         delete[] avec;
00173         delete[] bvec;
00174     }
00175 
00176     return result;
00177 }
00178 
00179 void CWeightedCommWordStringKernel::add_to_normal(
00180     int32_t vec_idx, float64_t weight)
00181 {
00182     int32_t len=-1;
00183     CStringFeatures<uint16_t>* s=(CStringFeatures<uint16_t>*) lhs;
00184     uint16_t* vec=s->get_feature_vector(vec_idx, len);
00185 
00186     if (len>0)
00187     {
00188         for (int32_t j=0; j<len; j++)
00189         {
00190             uint8_t mask=0;
00191             int32_t offs=0;
00192             for (int32_t d=0; d<degree; d++)
00193             {
00194                 mask = mask | (1 << (degree-d-1));
00195                 int32_t idx=s->get_masked_symbols(vec[j], mask);
00196                 idx=s->shift_symbol(idx, degree-d-1);
00197                 dictionary_weights[offs + idx] += normalizer->normalize_lhs(weight*weights[d], vec_idx);
00198                 offs+=s->shift_offset(1,d+1);
00199             }
00200         }
00201 
00202         set_is_initialized(true);
00203     }
00204 }
00205 
00206 void CWeightedCommWordStringKernel::merge_normal()
00207 {
00208     ASSERT(get_is_initialized());
00209     ASSERT(use_sign==false);
00210 
00211     CStringFeatures<uint16_t>* s=(CStringFeatures<uint16_t>*) rhs;
00212     uint32_t num_symbols=(uint32_t) s->get_num_symbols();
00213     int32_t dic_size=1<<(sizeof(uint16_t)*8);
00214     float64_t* dic=new float64_t[dic_size];
00215     memset(dic, 0, sizeof(float64_t)*dic_size);
00216 
00217     for (uint32_t sym=0; sym<num_symbols; sym++)
00218     {
00219         float64_t result=0;
00220         uint8_t mask=0;
00221         int32_t offs=0;
00222         for (int32_t d=0; d<degree; d++)
00223         {
00224             mask = mask | (1 << (degree-d-1));
00225             int32_t idx=s->get_masked_symbols(sym, mask);
00226             idx=s->shift_symbol(idx, degree-d-1);
00227             result += dictionary_weights[offs + idx];
00228             offs+=s->shift_offset(1,d+1);
00229         }
00230         dic[sym]=result;
00231     }
00232 
00233     init_dictionary(1<<(sizeof(uint16_t)*8));
00234     memcpy(dictionary_weights, dic, sizeof(float64_t)*dic_size);
00235     delete[] dic;
00236 }
00237 
00238 float64_t CWeightedCommWordStringKernel::compute_optimized(int32_t i)
00239 { 
00240     if (!get_is_initialized())
00241         SG_ERROR( "CCommWordStringKernel optimization not initialized\n");
00242 
00243     ASSERT(use_sign==false);
00244 
00245     float64_t result=0;
00246     int32_t len=-1;
00247     CStringFeatures<uint16_t>* s=(CStringFeatures<uint16_t>*) rhs;
00248     uint16_t* vec=s->get_feature_vector(i, len);
00249 
00250     if (vec && len>0)
00251     {
00252         for (int32_t j=0; j<len; j++)
00253         {
00254             uint8_t mask=0;
00255             int32_t offs=0;
00256             for (int32_t d=0; d<degree; d++)
00257             {
00258                 mask = mask | (1 << (degree-d-1));
00259                 int32_t idx=s->get_masked_symbols(vec[j], mask);
00260                 idx=s->shift_symbol(idx, degree-d-1);
00261                 result += dictionary_weights[offs + idx]*weights[d];
00262                 offs+=s->shift_offset(1,d+1);
00263             }
00264         }
00265 
00266         result=normalizer->normalize_rhs(result, i);
00267     }
00268     return result;
00269 }
00270 
00271 float64_t* CWeightedCommWordStringKernel::compute_scoring(
00272     int32_t max_degree, int32_t& num_feat, int32_t& num_sym, float64_t* target,
00273     int32_t num_suppvec, int32_t* IDX, float64_t* alphas, bool do_init)
00274 {
00275     if (do_init)
00276         CCommWordStringKernel::init_optimization(num_suppvec, IDX, alphas);
00277 
00278     int32_t dic_size=1<<(sizeof(uint16_t)*9);
00279     float64_t* dic=new float64_t[dic_size];
00280     memcpy(dic, dictionary_weights, sizeof(float64_t)*dic_size);
00281 
00282     merge_normal();
00283     float64_t* result=CCommWordStringKernel::compute_scoring(max_degree, num_feat,
00284             num_sym, target, num_suppvec, IDX, alphas, false);
00285 
00286     init_dictionary(1<<(sizeof(uint16_t)*9));
00287     memcpy(dictionary_weights,dic,  sizeof(float64_t)*dic_size);
00288     delete[] dic;
00289 
00290     return result;
00291 }

SHOGUN Machine Learning Toolbox - Documentation