WDFeatures.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2009 Soeren Sonnenburg
00008  * Copyright (C) 2009 Fraunhofer Institute FIRST and Max-Planck-Society
00009  */
00010 
00011 #include "features/WDFeatures.h"
00012 #include "lib/io.h"
00013 
00014 using namespace shogun;
00015 
00016 CWDFeatures::CWDFeatures(CStringFeatures<uint8_t>* str,
00017         int32_t order, int32_t from_order) : CDotFeatures()
00018 {
00019     ASSERT(str);
00020     ASSERT(str->have_same_length());
00021     SG_REF(str);
00022 
00023     strings=str;
00024     string_length=str->get_max_vector_length();
00025     num_strings=str->get_num_vectors();
00026     CAlphabet* alpha=str->get_alphabet();
00027     alphabet_size=alpha->get_num_symbols();
00028     SG_UNREF(alpha);
00029 
00030     degree=order;
00031     from_degree=from_order;
00032     wd_weights=NULL;
00033     set_wd_weights();
00034     set_normalization_const();
00035 
00036 }
00037 
00038 CWDFeatures::CWDFeatures(const CWDFeatures& orig)
00039     : CDotFeatures(orig), strings(orig.strings),
00040     degree(orig.degree), from_degree(orig.from_degree),
00041     normalization_const(orig.normalization_const)
00042 {
00043     SG_REF(strings);
00044     string_length=strings->get_max_vector_length();
00045     num_strings=strings->get_num_vectors();
00046     CAlphabet* alpha=strings->get_alphabet();
00047     alphabet_size=alpha->get_num_symbols();
00048     SG_UNREF(alpha);
00049 
00050     wd_weights=NULL;
00051     set_wd_weights();
00052 }
00053 
00054 CWDFeatures::~CWDFeatures()
00055 {
00056     SG_UNREF(strings);
00057     delete[] wd_weights;
00058 }
00059 
00060 float64_t CWDFeatures::dot(int32_t vec_idx1, int32_t vec_idx2)
00061 {
00062     int32_t len1, len2;
00063     bool free_vec1, free_vec2;
00064 
00065     uint8_t* vec1=strings->get_feature_vector(vec_idx1, len1, free_vec1);
00066     uint8_t* vec2=strings->get_feature_vector(vec_idx2, len2, free_vec2);
00067 
00068     ASSERT(len1==len2);
00069 
00070     float64_t sum=0.0;
00071 
00072     for (int32_t i=0; i<len1; i++)
00073     {
00074         for (int32_t j=0; (i+j<len1) && (j<degree); j++)
00075         {
00076             if (vec1[i+j]!=vec2[i+j])
00077                 break ;
00078             sum += wd_weights[j]*wd_weights[j];
00079         }
00080     }
00081     strings->free_feature_vector(vec1, vec_idx1, free_vec1);
00082     strings->free_feature_vector(vec2, vec_idx2, free_vec2);
00083     return sum/CMath::sq(normalization_const);
00084 }
00085 
00086 float64_t CWDFeatures::dense_dot(int32_t vec_idx1, const float64_t* vec2, int32_t vec2_len)
00087 {
00088     if (vec2_len != w_dim)
00089         SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim);
00090 
00091     float64_t sum=0;
00092     int32_t lim=CMath::min(degree, string_length);
00093     int32_t len;
00094     bool free_vec1;
00095     uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1);
00096     int32_t* val=new int32_t[len];
00097     CMath::fill_vector(val, len, 0);
00098 
00099     int32_t asize=alphabet_size;
00100     int32_t asizem1=1;
00101     int32_t offs=0;
00102 
00103     for (int32_t k=0; k<lim; k++)
00104     {
00105         float64_t wd = wd_weights[k];
00106 
00107         int32_t o=offs;
00108         for (int32_t i=0; i+k < len; i++) 
00109         {
00110             val[i]+=asizem1*vec[i+k];
00111             sum+=vec2[val[i]+o]*wd;
00112             o+=asize;
00113         }
00114         offs+=asize*len;
00115         asize*=alphabet_size;
00116         asizem1*=alphabet_size;
00117     }
00118     delete[] val;
00119     strings->free_feature_vector(vec, vec_idx1, free_vec1);
00120 
00121     return sum/normalization_const;
00122 }
00123 
00124 void CWDFeatures::add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t* vec2, int32_t vec2_len, bool abs_val)
00125 {
00126     if (vec2_len != w_dim)
00127         SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim);
00128 
00129     int32_t lim=CMath::min(degree, string_length);
00130     int32_t len;
00131     bool free_vec1;
00132     uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1);
00133     int32_t* val=new int32_t[len];
00134     CMath::fill_vector(val, len, 0);
00135 
00136     int32_t asize=alphabet_size;
00137     int32_t asizem1=1;
00138     int32_t offs=0;
00139 
00140     for (int32_t k=0; k<lim; k++)
00141     {
00142         float64_t wd = alpha*wd_weights[k]/normalization_const;
00143 
00144         if (abs_val)
00145             wd=CMath::abs(wd);
00146 
00147         int32_t o=offs;
00148         for (int32_t i=0; i+k < len; i++) 
00149         {
00150             val[i]+=asizem1*vec[i+k];
00151             vec2[val[i]+o]+=wd;
00152             o+=asize;
00153         }
00154         offs+=asize*len;
00155         asize*=alphabet_size;
00156         asizem1*=alphabet_size;
00157     }
00158     delete[] val;
00159 
00160     strings->free_feature_vector(vec, vec_idx1, free_vec1);
00161 }
00162 
00163 void CWDFeatures::set_wd_weights()
00164 {
00165     ASSERT(degree>0 && degree<=8);
00166     delete[] wd_weights;
00167     wd_weights=new float64_t[degree];
00168     w_dim=0;
00169 
00170     for (int32_t i=0; i<degree; i++)
00171     {
00172         w_dim+=CMath::pow(alphabet_size, i+1)*string_length;
00173         wd_weights[i]=sqrt(2.0*(from_degree-i)/(from_degree*(from_degree+1)));
00174     }
00175     SG_DEBUG("created WDFeatures with d=%d (%d), alphabetsize=%d, dim=%d num=%d, len=%d\n", degree, from_degree, alphabet_size, w_dim, num_strings, string_length);
00176 }
00177 
00178 
00179 void CWDFeatures::set_normalization_const(float64_t n)
00180 {
00181     if (n==0)
00182     {
00183         normalization_const=0;
00184         for (int32_t i=0; i<degree; i++)
00185             normalization_const+=(string_length-i)*wd_weights[i]*wd_weights[i];
00186 
00187         normalization_const=CMath::sqrt(normalization_const);
00188     }
00189     else
00190         normalization_const=n;
00191 
00192     SG_DEBUG("normalization_const:%f\n", normalization_const);
00193 }
00194 
00195 void* CWDFeatures::get_feature_iterator(int32_t vector_index)
00196 {
00197     if (vector_index>=num_strings)
00198     {
00199         SG_ERROR("Index out of bounds (number of strings %d, you "
00200                 "requested %d)\n", num_strings, vector_index);
00201     }
00202 
00203     wd_feature_iterator* it=new wd_feature_iterator[1];
00204 
00205     it->lim=CMath::min(degree, string_length);
00206     it->vec= strings->get_feature_vector(vector_index, it->vlen, it->vfree);
00207     it->vidx=vector_index;
00208 
00209     it->vec = strings->get_feature_vector(vector_index, it->vlen, it->vfree);
00210     it->val=new int32_t[it->vlen];
00211     CMath::fill_vector(it->val, it->vlen, 0);
00212 
00213     it->asize=alphabet_size;
00214     it->asizem1=1;
00215     it->offs=0;
00216     it->k=0;
00217     it->i=0;
00218     it->o=0;
00219 
00220     return it;
00221 }
00222 
00223 bool CWDFeatures::get_next_feature(int32_t& index, float64_t& value, void* iterator)
00224 {
00225     wd_feature_iterator* it=(wd_feature_iterator*) iterator;
00226 
00227     if (it->i + it->k >= it->vlen)
00228     {
00229         if (it->k < it->lim-1)
00230         {
00231             it->offs+=it->asize*it->vlen;
00232             it->asize*=alphabet_size;
00233             it->asizem1*=alphabet_size;
00234             it->k++;
00235             it->i=0;
00236             it->o=it->offs;
00237         }
00238         else
00239             return false;
00240     }
00241 
00242     int32_t i=it->i;
00243     int32_t k=it->k;
00244 #ifdef DEBUG_WDFEATURES
00245     SG_PRINT("i=%d k=%d offs=%d o=%d asize=%d asizem1=%d\n", i, k, it->offs, it->o, it->asize, it->asizem1);
00246 #endif
00247 
00248     it->val[i]+=it->asizem1*it->vec[i+k];
00249     value=wd_weights[k]/normalization_const;
00250     index=it->val[i]+it->o;
00251 #ifdef DEBUG_WDFEATURES
00252     SG_PRINT("index=%d val=%f w_size=%d lim=%d vlen=%d\n", index, value, w_dim, it->lim, it->vlen);
00253 #endif
00254 
00255     it->o+=it->asize;
00256     it->i=i+1;
00257 
00258     return true;
00259 }
00260 
00261 void CWDFeatures::free_feature_iterator(void* iterator)
00262 {
00263     ASSERT(iterator);
00264     wd_feature_iterator* it=(wd_feature_iterator*) iterator;
00265     strings->free_feature_vector(it->vec, it->vidx, it->vfree);
00266     delete[] it->val;
00267     delete[] it;
00268 }
00269 
00270 CFeatures* CWDFeatures::duplicate() const
00271 {
00272     return new CWDFeatures(*this);
00273 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation