00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011 #include "features/WDFeatures.h"
00012 #include "lib/io.h"
00013
00014 using namespace shogun;
00015
00016 CWDFeatures::CWDFeatures(CStringFeatures<uint8_t>* str,
00017 int32_t order, int32_t from_order) : CDotFeatures()
00018 {
00019 ASSERT(str);
00020 ASSERT(str->have_same_length());
00021 SG_REF(str);
00022
00023 strings=str;
00024 string_length=str->get_max_vector_length();
00025 num_strings=str->get_num_vectors();
00026 CAlphabet* alpha=str->get_alphabet();
00027 alphabet_size=alpha->get_num_symbols();
00028 SG_UNREF(alpha);
00029
00030 degree=order;
00031 from_degree=from_order;
00032 wd_weights=NULL;
00033 set_wd_weights();
00034 set_normalization_const();
00035
00036 }
00037
00038 CWDFeatures::CWDFeatures(const CWDFeatures& orig)
00039 : CDotFeatures(orig), strings(orig.strings),
00040 degree(orig.degree), from_degree(orig.from_degree),
00041 normalization_const(orig.normalization_const)
00042 {
00043 SG_REF(strings);
00044 string_length=strings->get_max_vector_length();
00045 num_strings=strings->get_num_vectors();
00046 CAlphabet* alpha=strings->get_alphabet();
00047 alphabet_size=alpha->get_num_symbols();
00048 SG_UNREF(alpha);
00049
00050 wd_weights=NULL;
00051 set_wd_weights();
00052 }
00053
00054 CWDFeatures::~CWDFeatures()
00055 {
00056 SG_UNREF(strings);
00057 delete[] wd_weights;
00058 }
00059
00060 float64_t CWDFeatures::dot(int32_t vec_idx1, int32_t vec_idx2)
00061 {
00062 int32_t len1, len2;
00063 bool free_vec1, free_vec2;
00064
00065 uint8_t* vec1=strings->get_feature_vector(vec_idx1, len1, free_vec1);
00066 uint8_t* vec2=strings->get_feature_vector(vec_idx2, len2, free_vec2);
00067
00068 ASSERT(len1==len2);
00069
00070 float64_t sum=0.0;
00071
00072 for (int32_t i=0; i<len1; i++)
00073 {
00074 for (int32_t j=0; (i+j<len1) && (j<degree); j++)
00075 {
00076 if (vec1[i+j]!=vec2[i+j])
00077 break ;
00078 sum += wd_weights[j]*wd_weights[j];
00079 }
00080 }
00081 strings->free_feature_vector(vec1, vec_idx1, free_vec1);
00082 strings->free_feature_vector(vec2, vec_idx2, free_vec2);
00083 return sum/CMath::sq(normalization_const);
00084 }
00085
00086 float64_t CWDFeatures::dense_dot(int32_t vec_idx1, const float64_t* vec2, int32_t vec2_len)
00087 {
00088 if (vec2_len != w_dim)
00089 SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim);
00090
00091 float64_t sum=0;
00092 int32_t lim=CMath::min(degree, string_length);
00093 int32_t len;
00094 bool free_vec1;
00095 uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1);
00096 int32_t* val=new int32_t[len];
00097 CMath::fill_vector(val, len, 0);
00098
00099 int32_t asize=alphabet_size;
00100 int32_t asizem1=1;
00101 int32_t offs=0;
00102
00103 for (int32_t k=0; k<lim; k++)
00104 {
00105 float64_t wd = wd_weights[k];
00106
00107 int32_t o=offs;
00108 for (int32_t i=0; i+k < len; i++)
00109 {
00110 val[i]+=asizem1*vec[i+k];
00111 sum+=vec2[val[i]+o]*wd;
00112 o+=asize;
00113 }
00114 offs+=asize*len;
00115 asize*=alphabet_size;
00116 asizem1*=alphabet_size;
00117 }
00118 delete[] val;
00119 strings->free_feature_vector(vec, vec_idx1, free_vec1);
00120
00121 return sum/normalization_const;
00122 }
00123
00124 void CWDFeatures::add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t* vec2, int32_t vec2_len, bool abs_val)
00125 {
00126 if (vec2_len != w_dim)
00127 SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim);
00128
00129 int32_t lim=CMath::min(degree, string_length);
00130 int32_t len;
00131 bool free_vec1;
00132 uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1);
00133 int32_t* val=new int32_t[len];
00134 CMath::fill_vector(val, len, 0);
00135
00136 int32_t asize=alphabet_size;
00137 int32_t asizem1=1;
00138 int32_t offs=0;
00139
00140 for (int32_t k=0; k<lim; k++)
00141 {
00142 float64_t wd = alpha*wd_weights[k]/normalization_const;
00143
00144 if (abs_val)
00145 wd=CMath::abs(wd);
00146
00147 int32_t o=offs;
00148 for (int32_t i=0; i+k < len; i++)
00149 {
00150 val[i]+=asizem1*vec[i+k];
00151 vec2[val[i]+o]+=wd;
00152 o+=asize;
00153 }
00154 offs+=asize*len;
00155 asize*=alphabet_size;
00156 asizem1*=alphabet_size;
00157 }
00158 delete[] val;
00159
00160 strings->free_feature_vector(vec, vec_idx1, free_vec1);
00161 }
00162
00163 void CWDFeatures::set_wd_weights()
00164 {
00165 ASSERT(degree>0 && degree<=8);
00166 delete[] wd_weights;
00167 wd_weights=new float64_t[degree];
00168 w_dim=0;
00169
00170 for (int32_t i=0; i<degree; i++)
00171 {
00172 w_dim+=CMath::pow(alphabet_size, i+1)*string_length;
00173 wd_weights[i]=sqrt(2.0*(from_degree-i)/(from_degree*(from_degree+1)));
00174 }
00175 SG_DEBUG("created WDFeatures with d=%d (%d), alphabetsize=%d, dim=%d num=%d, len=%d\n", degree, from_degree, alphabet_size, w_dim, num_strings, string_length);
00176 }
00177
00178
00179 void CWDFeatures::set_normalization_const(float64_t n)
00180 {
00181 if (n==0)
00182 {
00183 normalization_const=0;
00184 for (int32_t i=0; i<degree; i++)
00185 normalization_const+=(string_length-i)*wd_weights[i]*wd_weights[i];
00186
00187 normalization_const=CMath::sqrt(normalization_const);
00188 }
00189 else
00190 normalization_const=n;
00191
00192 SG_DEBUG("normalization_const:%f\n", normalization_const);
00193 }
00194
00195 void* CWDFeatures::get_feature_iterator(int32_t vector_index)
00196 {
00197 if (vector_index>=num_strings)
00198 {
00199 SG_ERROR("Index out of bounds (number of strings %d, you "
00200 "requested %d)\n", num_strings, vector_index);
00201 }
00202
00203 wd_feature_iterator* it=new wd_feature_iterator[1];
00204
00205 it->lim=CMath::min(degree, string_length);
00206 it->vec= strings->get_feature_vector(vector_index, it->vlen, it->vfree);
00207 it->vidx=vector_index;
00208
00209 it->vec = strings->get_feature_vector(vector_index, it->vlen, it->vfree);
00210 it->val=new int32_t[it->vlen];
00211 CMath::fill_vector(it->val, it->vlen, 0);
00212
00213 it->asize=alphabet_size;
00214 it->asizem1=1;
00215 it->offs=0;
00216 it->k=0;
00217 it->i=0;
00218 it->o=0;
00219
00220 return it;
00221 }
00222
00223 bool CWDFeatures::get_next_feature(int32_t& index, float64_t& value, void* iterator)
00224 {
00225 wd_feature_iterator* it=(wd_feature_iterator*) iterator;
00226
00227 if (it->i + it->k >= it->vlen)
00228 {
00229 if (it->k < it->lim-1)
00230 {
00231 it->offs+=it->asize*it->vlen;
00232 it->asize*=alphabet_size;
00233 it->asizem1*=alphabet_size;
00234 it->k++;
00235 it->i=0;
00236 it->o=it->offs;
00237 }
00238 else
00239 return false;
00240 }
00241
00242 int32_t i=it->i;
00243 int32_t k=it->k;
00244 #ifdef DEBUG_WDFEATURES
00245 SG_PRINT("i=%d k=%d offs=%d o=%d asize=%d asizem1=%d\n", i, k, it->offs, it->o, it->asize, it->asizem1);
00246 #endif
00247
00248 it->val[i]+=it->asizem1*it->vec[i+k];
00249 value=wd_weights[k]/normalization_const;
00250 index=it->val[i]+it->o;
00251 #ifdef DEBUG_WDFEATURES
00252 SG_PRINT("index=%d val=%f w_size=%d lim=%d vlen=%d\n", index, value, w_dim, it->lim, it->vlen);
00253 #endif
00254
00255 it->o+=it->asize;
00256 it->i=i+1;
00257
00258 return true;
00259 }
00260
00261 void CWDFeatures::free_feature_iterator(void* iterator)
00262 {
00263 ASSERT(iterator);
00264 wd_feature_iterator* it=(wd_feature_iterator*) iterator;
00265 strings->free_feature_vector(it->vec, it->vidx, it->vfree);
00266 delete[] it->val;
00267 delete[] it;
00268 }
00269
00270 CFeatures* CWDFeatures::duplicate() const
00271 {
00272 return new CWDFeatures(*this);
00273 }