WDFeatures.cpp
Go to the documentation of this file.00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011 #include "features/WDFeatures.h"
00012 #include "lib/io.h"
00013
00014 CWDFeatures::CWDFeatures(CStringFeatures<uint8_t>* str,
00015 int32_t order, int32_t from_order) : CDotFeatures()
00016 {
00017 ASSERT(str);
00018 ASSERT(str->have_same_length());
00019 SG_REF(str);
00020
00021 strings=str;
00022 string_length=str->get_max_vector_length();
00023 num_strings=str->get_num_vectors();
00024 CAlphabet* alpha=str->get_alphabet();
00025 alphabet_size=alpha->get_num_symbols();
00026 SG_UNREF(alpha);
00027
00028 degree=order;
00029 from_degree=from_order;
00030 set_wd_weights();
00031 set_normalization_const();
00032
00033 }
00034
00035 CWDFeatures::CWDFeatures(const CWDFeatures& orig)
00036 : CDotFeatures(orig), strings(orig.strings),
00037 degree(orig.degree), from_degree(orig.from_degree)
00038 {
00039 SG_REF(strings);
00040 string_length=strings->get_max_vector_length();
00041 num_strings=strings->get_num_vectors();
00042 CAlphabet* alpha=strings->get_alphabet();
00043 alphabet_size=alpha->get_num_symbols();
00044 SG_UNREF(alpha);
00045
00046 set_wd_weights();
00047 set_normalization_const();
00048 }
00049
00050 CWDFeatures::~CWDFeatures()
00051 {
00052 SG_UNREF(strings);
00053 }
00054
00055 float64_t CWDFeatures::dot(int32_t vec_idx1, int32_t vec_idx2)
00056 {
00057 int32_t len1, len2;
00058 uint8_t* vec1=strings->get_feature_vector(vec_idx1, len1);
00059 uint8_t* vec2=strings->get_feature_vector(vec_idx2, len2);
00060
00061 ASSERT(len1==len2);
00062
00063 float64_t sum=0.0;
00064
00065 for (int32_t i=0; i<len1; i++)
00066 {
00067 for (int32_t j=0; (i+j<len1) && (j<degree); j++)
00068 {
00069 if (vec1[i+j]!=vec2[i+j])
00070 break ;
00071 sum += wd_weights[j]*wd_weights[j];
00072 }
00073 }
00074 return sum;
00075 }
00076
00077 float64_t CWDFeatures::dense_dot(int32_t vec_idx1, const float64_t* vec2, int32_t vec2_len)
00078 {
00079 if (vec2_len != w_dim)
00080 SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim);
00081
00082 float64_t sum=0;
00083 int32_t lim=CMath::min(degree, string_length);
00084 int32_t len;
00085 uint8_t* vec = strings->get_feature_vector(vec_idx1, len);
00086 int32_t* val=new int32_t[len];
00087 CMath::fill_vector(val, len, 0);
00088
00089 int32_t asize=alphabet_size;
00090 int32_t asizem1=1;
00091 int32_t offs=0;
00092
00093 for (int32_t k=0; k<lim; k++)
00094 {
00095 float64_t wd = wd_weights[k];
00096
00097 int32_t o=offs;
00098 for (int32_t i=0; i+k < len; i++)
00099 {
00100 val[i]+=asizem1*vec[i+k];
00101 sum+=vec2[val[i]+o]*wd;
00102 o+=asize;
00103 }
00104 offs+=asize*len;
00105 asize*=alphabet_size;
00106 asizem1*=alphabet_size;
00107 }
00108 delete[] val;
00109 return sum/normalization_const;
00110 }
00111
00112 void CWDFeatures::add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t* vec2, int32_t vec2_len, bool abs_val)
00113 {
00114 if (vec2_len != w_dim)
00115 SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim);
00116
00117 int32_t lim=CMath::min(degree, string_length);
00118 int32_t len;
00119 uint8_t* vec = strings->get_feature_vector(vec_idx1, len);
00120 int32_t* val=new int32_t[len];
00121 CMath::fill_vector(val, len, 0);
00122
00123 int32_t asize=alphabet_size;
00124 int32_t asizem1=1;
00125 int32_t offs=0;
00126
00127 for (int32_t k=0; k<lim; k++)
00128 {
00129 float64_t wd = alpha*wd_weights[k]/normalization_const;
00130
00131 if (abs_val)
00132 wd=CMath::abs(wd);
00133
00134 int32_t o=offs;
00135 for (int32_t i=0; i+k < len; i++)
00136 {
00137 val[i]+=asizem1*vec[i+k];
00138 vec2[val[i]+o]+=wd;
00139 o+=asize;
00140 }
00141 offs+=asize*len;
00142 asize*=alphabet_size;
00143 asizem1*=alphabet_size;
00144 }
00145 delete[] val;
00146 }
00147
00148 void CWDFeatures::set_wd_weights()
00149 {
00150 ASSERT(degree>0 && degree<=8);
00151 wd_weights=new float64_t[degree];
00152 w_dim=0;
00153
00154 for (int32_t i=0; i<degree; i++)
00155 {
00156 w_dim+=CMath::pow(alphabet_size, i+1)*string_length;
00157 wd_weights[i]=sqrt(2.0*(from_degree-i)/(from_degree*(from_degree+1)));
00158 }
00159 SG_DEBUG("created WDFeatures with d=%d (%d), alphabetsize=%d, dim=%d num=%d, len=%d\n", degree, from_degree, alphabet_size, w_dim, num_strings, string_length);
00160 }
00161
00162
00163 void CWDFeatures::set_normalization_const()
00164 {
00165 normalization_const=0;
00166 for (int32_t i=0; i<degree; i++)
00167 normalization_const+=(string_length-i)*wd_weights[i]*wd_weights[i];
00168
00169 normalization_const=CMath::sqrt(normalization_const);
00170 SG_DEBUG("normalization_const:%f\n", normalization_const);
00171 }
00172
00173 CFeatures* CWDFeatures::duplicate() const
00174 {
00175 return new CWDFeatures(*this);
00176 }