WDFeatures.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2009 Soeren Sonnenburg
00008  * Copyright (C) 2009 Fraunhofer Institute FIRST and Max-Planck-Society
00009  */
00010 
00011 #include "features/WDFeatures.h"
00012 #include "lib/io.h"
00013 
00014 CWDFeatures::CWDFeatures(CStringFeatures<uint8_t>* str,
00015         int32_t order, int32_t from_order) : CDotFeatures()
00016 {
00017     ASSERT(str);
00018     ASSERT(str->have_same_length());
00019     SG_REF(str);
00020 
00021     strings=str;
00022     string_length=str->get_max_vector_length();
00023     num_strings=str->get_num_vectors();
00024     CAlphabet* alpha=str->get_alphabet();
00025     alphabet_size=alpha->get_num_symbols();
00026     SG_UNREF(alpha);
00027 
00028     degree=order;
00029     from_degree=from_order;
00030     set_wd_weights();
00031     set_normalization_const();
00032 
00033 }
00034 
00035 CWDFeatures::CWDFeatures(const CWDFeatures& orig)
00036     : CDotFeatures(orig), strings(orig.strings),
00037     degree(orig.degree), from_degree(orig.from_degree)
00038 {
00039     SG_REF(strings);
00040     string_length=strings->get_max_vector_length();
00041     num_strings=strings->get_num_vectors();
00042     CAlphabet* alpha=strings->get_alphabet();
00043     alphabet_size=alpha->get_num_symbols();
00044     SG_UNREF(alpha);
00045 
00046     set_wd_weights();
00047     set_normalization_const();
00048 }
00049 
00050 CWDFeatures::~CWDFeatures()
00051 {
00052     SG_UNREF(strings);
00053 }
00054 
00055 float64_t CWDFeatures::dot(int32_t vec_idx1, int32_t vec_idx2)
00056 {
00057     int32_t len1, len2;
00058     uint8_t* vec1=strings->get_feature_vector(vec_idx1, len1);
00059     uint8_t* vec2=strings->get_feature_vector(vec_idx2, len2);
00060 
00061     ASSERT(len1==len2);
00062 
00063     float64_t sum=0.0;
00064 
00065     for (int32_t i=0; i<len1; i++)
00066     {
00067         for (int32_t j=0; (i+j<len1) && (j<degree); j++)
00068         {
00069             if (vec1[i+j]!=vec2[i+j])
00070                 break ;
00071             sum += wd_weights[j]*wd_weights[j];
00072         }
00073     }
00074     return sum;
00075 }
00076 
00077 float64_t CWDFeatures::dense_dot(int32_t vec_idx1, const float64_t* vec2, int32_t vec2_len)
00078 {
00079     if (vec2_len != w_dim)
00080         SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim);
00081 
00082     float64_t sum=0;
00083     int32_t lim=CMath::min(degree, string_length);
00084     int32_t len;
00085     uint8_t* vec = strings->get_feature_vector(vec_idx1, len);
00086     int32_t* val=new int32_t[len];
00087     CMath::fill_vector(val, len, 0);
00088 
00089     int32_t asize=alphabet_size;
00090     int32_t asizem1=1;
00091     int32_t offs=0;
00092 
00093     for (int32_t k=0; k<lim; k++)
00094     {
00095         float64_t wd = wd_weights[k];
00096 
00097         int32_t o=offs;
00098         for (int32_t i=0; i+k < len; i++) 
00099         {
00100             val[i]+=asizem1*vec[i+k];
00101             sum+=vec2[val[i]+o]*wd;
00102             o+=asize;
00103         }
00104         offs+=asize*len;
00105         asize*=alphabet_size;
00106         asizem1*=alphabet_size;
00107     }
00108     delete[] val;
00109     return sum/normalization_const;
00110 }
00111 
00112 void CWDFeatures::add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t* vec2, int32_t vec2_len, bool abs_val)
00113 {
00114     if (vec2_len != w_dim)
00115         SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim);
00116 
00117     int32_t lim=CMath::min(degree, string_length);
00118     int32_t len;
00119     uint8_t* vec = strings->get_feature_vector(vec_idx1, len);
00120     int32_t* val=new int32_t[len];
00121     CMath::fill_vector(val, len, 0);
00122 
00123     int32_t asize=alphabet_size;
00124     int32_t asizem1=1;
00125     int32_t offs=0;
00126 
00127     for (int32_t k=0; k<lim; k++)
00128     {
00129         float64_t wd = alpha*wd_weights[k]/normalization_const;
00130 
00131         if (abs_val)
00132             wd=CMath::abs(wd);
00133 
00134         int32_t o=offs;
00135         for (int32_t i=0; i+k < len; i++) 
00136         {
00137             val[i]+=asizem1*vec[i+k];
00138             vec2[val[i]+o]+=wd;
00139             o+=asize;
00140         }
00141         offs+=asize*len;
00142         asize*=alphabet_size;
00143         asizem1*=alphabet_size;
00144     }
00145     delete[] val;
00146 }
00147 
00148 void CWDFeatures::set_wd_weights()
00149 {
00150     ASSERT(degree>0 && degree<=8);
00151     wd_weights=new float64_t[degree];
00152     w_dim=0;
00153 
00154     for (int32_t i=0; i<degree; i++)
00155     {
00156         w_dim+=CMath::pow(alphabet_size, i+1)*string_length;
00157         wd_weights[i]=sqrt(2.0*(from_degree-i)/(from_degree*(from_degree+1)));
00158     }
00159     SG_DEBUG("created WDFeatures with d=%d (%d), alphabetsize=%d, dim=%d num=%d, len=%d\n", degree, from_degree, alphabet_size, w_dim, num_strings, string_length);
00160 }
00161 
00162 
00163 void CWDFeatures::set_normalization_const()
00164 {
00165     normalization_const=0;
00166     for (int32_t i=0; i<degree; i++)
00167         normalization_const+=(string_length-i)*wd_weights[i]*wd_weights[i];
00168 
00169     normalization_const=CMath::sqrt(normalization_const);
00170     SG_DEBUG("normalization_const:%f\n", normalization_const);
00171 }
00172 
00173 CFeatures* CWDFeatures::duplicate() const
00174 {
00175     return new CWDFeatures(*this);
00176 }

SHOGUN Machine Learning Toolbox - Documentation