00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011 #include "features/ImplicitWeightedSpecFeatures.h"
00012 #include "lib/io.h"
00013
00014 using namespace shogun;
00015
00016 CImplicitWeightedSpecFeatures::CImplicitWeightedSpecFeatures(CStringFeatures<uint16_t>* str, bool normalize) : CDotFeatures()
00017 {
00018 ASSERT(str);
00019 strings=str;
00020 SG_REF(strings)
00021 normalization_factors=NULL;
00022 spec_weights=NULL;
00023 num_strings = str->get_num_vectors();
00024 alphabet_size = str->get_original_num_symbols();
00025 degree=str->get_order();
00026 set_wd_weights();
00027
00028 SG_DEBUG("WEIGHTED SPEC alphasz=%d, size=%d, num_str=%d\n", alphabet_size,
00029 spec_size, num_strings);
00030
00031 if (normalize)
00032 compute_normalization_const();
00033 }
00034
00035 void CImplicitWeightedSpecFeatures::compute_normalization_const()
00036 {
00037 float64_t* factors=new float64_t[num_strings];
00038
00039 for (int32_t i=0; i<num_strings; i++)
00040 factors[i]=1.0/CMath::sqrt(dot(i,i));
00041
00042 normalization_factors=factors;
00043
00044 }
00045
00046 bool CImplicitWeightedSpecFeatures::set_wd_weights()
00047 {
00048 delete[] spec_weights;
00049 spec_weights=new float64_t[degree];
00050
00051 int32_t i;
00052 float64_t sum=0;
00053 spec_size=0;
00054
00055 for (i=0; i<degree; i++)
00056 {
00057 spec_size+=CMath::pow(alphabet_size, i+1);
00058 spec_weights[i]=degree-i;
00059 sum+=spec_weights[i];
00060 }
00061 for (i=0; i<degree; i++)
00062 spec_weights[i]=CMath::sqrt(spec_weights[i]/sum);
00063
00064 return spec_weights!=NULL;
00065 }
00066
00067 bool CImplicitWeightedSpecFeatures::set_weights(float64_t* w, int32_t d)
00068 {
00069 ASSERT(d==degree);
00070
00071 delete[] spec_weights;
00072 spec_weights=new float64_t[degree];
00073 for (int32_t i=0; i<degree; i++)
00074 spec_weights[i]=CMath::sqrt(w[i]);
00075 return true;
00076 }
00077
00078 CImplicitWeightedSpecFeatures::CImplicitWeightedSpecFeatures(const CImplicitWeightedSpecFeatures& orig) : CDotFeatures(orig),
00079 num_strings(orig.num_strings),
00080 alphabet_size(orig.alphabet_size), spec_size(orig.spec_size)
00081 {
00082 SG_NOTIMPLEMENTED;
00083 SG_REF(strings);
00084 }
00085
00086 CImplicitWeightedSpecFeatures::~CImplicitWeightedSpecFeatures()
00087 {
00088 SG_UNREF(strings);
00089 delete[] spec_weights;
00090 delete[] normalization_factors;
00091 }
00092
00093 float64_t CImplicitWeightedSpecFeatures::dot(int32_t vec_idx1, int32_t vec_idx2)
00094 {
00095 ASSERT(vec_idx1 < num_strings);
00096 ASSERT(vec_idx2 < num_strings);
00097
00098 int32_t len1=-1;
00099 int32_t len2=-1;
00100 bool free_vec1;
00101 bool free_vec2;
00102 uint16_t* vec1=strings->get_feature_vector(vec_idx1, len1, free_vec1);
00103 uint16_t* vec2=strings->get_feature_vector(vec_idx2, len2, free_vec2);
00104
00105 float64_t result=0;
00106 uint8_t mask=0;
00107
00108 for (int32_t d=0; d<degree; d++)
00109 {
00110 mask = mask | (1 << (degree-d-1));
00111 uint16_t masked=strings->get_masked_symbols(0xffff, mask);
00112
00113 int32_t left_idx=0;
00114 int32_t right_idx=0;
00115 float64_t weight=spec_weights[d]*spec_weights[d];
00116
00117 while (left_idx < len1 && right_idx < len2)
00118 {
00119 uint16_t lsym=vec1[left_idx] & masked;
00120 uint16_t rsym=vec2[right_idx] & masked;
00121
00122 if (lsym == rsym)
00123 {
00124 int32_t old_left_idx=left_idx;
00125 int32_t old_right_idx=right_idx;
00126
00127 while (left_idx<len1 && (vec1[left_idx] & masked) ==lsym)
00128 left_idx++;
00129
00130 while (right_idx<len2 && (vec2[right_idx] & masked) ==lsym)
00131 right_idx++;
00132
00133 result+=weight*(left_idx-old_left_idx)*(right_idx-old_right_idx);
00134 }
00135 else if (lsym<rsym)
00136 left_idx++;
00137 else
00138 right_idx++;
00139 }
00140 }
00141
00142 strings->free_feature_vector(vec1, vec_idx1, free_vec1);
00143 strings->free_feature_vector(vec2, vec_idx2, free_vec2);
00144
00145 if (normalization_factors)
00146 return result*normalization_factors[vec_idx1]*normalization_factors[vec_idx2];
00147 else
00148 return result;
00149 }
00150
00151 float64_t CImplicitWeightedSpecFeatures::dense_dot(int32_t vec_idx1, const float64_t* vec2, int32_t vec2_len)
00152 {
00153 ASSERT(vec2_len == spec_size);
00154 ASSERT(vec_idx1 < num_strings);
00155
00156 float64_t result=0;
00157 int32_t len1=-1;
00158 bool free_vec1;
00159 uint16_t* vec1=strings->get_feature_vector(vec_idx1, len1, free_vec1);
00160
00161 if (vec1 && len1>0)
00162 {
00163 for (int32_t j=0; j<len1; j++)
00164 {
00165 uint8_t mask=0;
00166 int32_t offs=0;
00167 uint16_t v=*vec1++;
00168
00169 for (int32_t d=0; d<degree; d++)
00170 {
00171 mask = mask | (1 << (degree-d-1));
00172 int32_t idx=strings->get_masked_symbols(v, mask);
00173 idx=strings->shift_symbol(idx, degree-d-1);
00174 result += vec2[offs + idx]*spec_weights[d];
00175 offs+=strings->shift_offset(1,d+1);
00176 }
00177 }
00178
00179 strings->free_feature_vector(vec1, vec_idx1, free_vec1);
00180
00181 if (normalization_factors)
00182 result*=normalization_factors[vec_idx1];
00183 }
00184 else
00185 SG_ERROR("huh?\n");
00186
00187 return result;
00188 }
00189
00190 void CImplicitWeightedSpecFeatures::add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t* vec2, int32_t vec2_len, bool abs_val)
00191 {
00192 int32_t len1=-1;
00193 bool free_vec1;
00194 uint16_t* vec=strings->get_feature_vector(vec_idx1, len1, free_vec1);
00195
00196 if (normalization_factors)
00197 alpha*=normalization_factors[vec_idx1];
00198
00199 if (vec && len1>0)
00200 {
00201 for (int32_t j=0; j<len1; j++)
00202 {
00203 uint8_t mask=0;
00204 int32_t offs=0;
00205 for (int32_t d=0; d<degree; d++)
00206 {
00207 mask = mask | (1 << (degree-d-1));
00208 int32_t idx=strings->get_masked_symbols(vec[j], mask);
00209 idx=strings->shift_symbol(idx, degree-d-1);
00210 if (abs_val)
00211 vec2[offs + idx] += CMath::abs(alpha*spec_weights[d]);
00212 else
00213 vec2[offs + idx] += alpha*spec_weights[d];
00214 offs+=strings->shift_offset(1,d+1);
00215 }
00216 }
00217 }
00218
00219 strings->free_feature_vector(vec, vec_idx1, free_vec1);
00220 }
00221
00222 CFeatures* CImplicitWeightedSpecFeatures::duplicate() const
00223 {
00224 return new CImplicitWeightedSpecFeatures(*this);
00225 }
00226
00227 void* CImplicitWeightedSpecFeatures::get_feature_iterator(int32_t vector_index)
00228 {
00229 if (vector_index>=num_strings)
00230 {
00231 SG_ERROR("Index out of bounds (number of strings %d, you "
00232 "requested %d)\n", num_strings, vector_index);
00233 }
00234
00235 wspec_feature_iterator* it=new wspec_feature_iterator[1];
00236 it->vec= strings->get_feature_vector(vector_index, it->vlen, it->vfree);
00237 it->vidx=vector_index;
00238
00239 it->offs=0;
00240 it->d=0;
00241 it->j=0;
00242 it->mask=0;
00243 it->alpha=normalization_factors[vector_index];
00244
00245 return it;
00246 }
00247
00248 bool CImplicitWeightedSpecFeatures::get_next_feature(int32_t& index, float64_t& value, void* iterator)
00249 {
00250 wspec_feature_iterator* it=(wspec_feature_iterator*) iterator;
00251
00252 if (it->d>=degree)
00253 {
00254 if (it->j < it->vlen-1)
00255 {
00256 it->j++;
00257 it->d=0;
00258 it->mask=0;
00259 it->offs=0;
00260 }
00261 else
00262 return false;
00263 }
00264
00265 int32_t d=it->d;
00266
00267 it->mask = it->mask | (1 << (degree-d-1));
00268 int32_t idx=strings->get_masked_symbols(it->vec[it->j], it->mask);
00269 idx=strings->shift_symbol(idx, degree-d-1);
00270 value=it->alpha*spec_weights[d];
00271 index=it->offs + idx;
00272 it->offs+=strings->shift_offset(1,d+1);
00273
00274 it->d=d+1;
00275 return true;
00276 }
00277
00278 void CImplicitWeightedSpecFeatures::free_feature_iterator(void* iterator)
00279 {
00280 ASSERT(iterator);
00281 wspec_feature_iterator* it=(wspec_feature_iterator*) iterator;
00282 strings->free_feature_vector(it->vec, it->vidx, it->vfree);
00283 delete[] it;
00284 }