00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011 #include "lib/common.h"
00012 #include "kernel/WeightedCommWordStringKernel.h"
00013 #include "features/StringFeatures.h"
00014 #include "lib/io.h"
00015
00016 CWeightedCommWordStringKernel::CWeightedCommWordStringKernel(
00017 int32_t size, bool us)
00018 : CCommWordStringKernel(size, us), degree(0), weights(NULL)
00019 {
00020 init_dictionary(1<<(sizeof(uint16_t)*9));
00021 ASSERT(us==false);
00022 }
00023
00024 CWeightedCommWordStringKernel::CWeightedCommWordStringKernel(
00025 CStringFeatures<uint16_t>* l, CStringFeatures<uint16_t>* r, bool us,
00026 int32_t size)
00027 : CCommWordStringKernel(size, us), degree(0), weights(NULL)
00028 {
00029 init_dictionary(1<<(sizeof(uint16_t)*9));
00030 ASSERT(us==false);
00031
00032 init(l,r);
00033 }
00034
00035 CWeightedCommWordStringKernel::~CWeightedCommWordStringKernel()
00036 {
00037 delete[] weights;
00038 }
00039
00040 bool CWeightedCommWordStringKernel::init(CFeatures* l, CFeatures* r)
00041 {
00042 ASSERT(((CStringFeatures<uint16_t>*) l)->get_order() ==
00043 ((CStringFeatures<uint16_t>*) r)->get_order());
00044 degree=((CStringFeatures<uint16_t>*) l)->get_order();
00045 set_wd_weights();
00046
00047 CCommWordStringKernel::init(l,r);
00048 return init_normalizer();
00049 }
00050
00051 void CWeightedCommWordStringKernel::cleanup()
00052 {
00053 delete[] weights;
00054 weights=NULL;
00055
00056 CCommWordStringKernel::cleanup();
00057 }
00058
00059 bool CWeightedCommWordStringKernel::set_wd_weights()
00060 {
00061 delete[] weights;
00062 weights=new float64_t[degree];
00063
00064 int32_t i;
00065 float64_t sum=0;
00066 for (i=0; i<degree; i++)
00067 {
00068 weights[i]=degree-i;
00069 sum+=weights[i];
00070 }
00071 for (i=0; i<degree; i++)
00072 weights[i]=CMath::sqrt(weights[i]/sum);
00073
00074 return weights!=NULL;
00075 }
00076
00077 bool CWeightedCommWordStringKernel::set_weights(float64_t* w, int32_t d)
00078 {
00079 ASSERT(d==degree);
00080
00081 delete[] weights;
00082 weights=new float64_t[degree];
00083 for (int32_t i=0; i<degree; i++)
00084 weights[i]=CMath::sqrt(w[i]);
00085 return true;
00086 }
00087
00088 float64_t CWeightedCommWordStringKernel::compute_helper(
00089 int32_t idx_a, int32_t idx_b, bool do_sort)
00090 {
00091 int32_t alen, blen;
00092
00093 CStringFeatures<uint16_t>* l = (CStringFeatures<uint16_t>*) lhs;
00094 CStringFeatures<uint16_t>* r = (CStringFeatures<uint16_t>*) rhs;
00095
00096 uint16_t* av=l->get_feature_vector(idx_a, alen);
00097 uint16_t* bv=r->get_feature_vector(idx_b, blen);
00098
00099 uint16_t* avec=av;
00100 uint16_t* bvec=bv;
00101
00102 if (do_sort)
00103 {
00104 if (alen>0)
00105 {
00106 avec=new uint16_t[alen];
00107 memcpy(avec, av, sizeof(uint16_t)*alen);
00108 CMath::radix_sort(avec, alen);
00109 }
00110 else
00111 avec=NULL;
00112
00113 if (blen>0)
00114 {
00115 bvec=new uint16_t[blen];
00116 memcpy(bvec, bv, sizeof(uint16_t)*blen);
00117 CMath::radix_sort(bvec, blen);
00118 }
00119 else
00120 bvec=NULL;
00121 }
00122 else
00123 {
00124 if ( (l->get_num_preproc() != l->get_num_preprocessed()) ||
00125 (r->get_num_preproc() != r->get_num_preprocessed()))
00126 {
00127 SG_ERROR("not all preprocessors have been applied to training (%d/%d)"
00128 " or test (%d/%d) data\n", l->get_num_preprocessed(), l->get_num_preproc(),
00129 r->get_num_preprocessed(), r->get_num_preproc());
00130 }
00131 }
00132
00133 float64_t result=0;
00134 uint8_t mask=0;
00135
00136 for (int32_t d=0; d<degree; d++)
00137 {
00138 mask = mask | (1 << (degree-d-1));
00139 uint16_t masked=((CStringFeatures<uint16_t>*) lhs)->get_masked_symbols(0xffff, mask);
00140
00141 int32_t left_idx=0;
00142 int32_t right_idx=0;
00143 float64_t weight=weights[d]*weights[d];
00144
00145 while (left_idx < alen && right_idx < blen)
00146 {
00147 uint16_t lsym=avec[left_idx] & masked;
00148 uint16_t rsym=bvec[right_idx] & masked;
00149
00150 if (lsym == rsym)
00151 {
00152 int32_t old_left_idx=left_idx;
00153 int32_t old_right_idx=right_idx;
00154
00155 while (left_idx<alen && (avec[left_idx] & masked) ==lsym)
00156 left_idx++;
00157
00158 while (right_idx<blen && (bvec[right_idx] & masked) ==lsym)
00159 right_idx++;
00160
00161 result+=weight*(left_idx-old_left_idx)*(right_idx-old_right_idx);
00162 }
00163 else if (lsym<rsym)
00164 left_idx++;
00165 else
00166 right_idx++;
00167 }
00168 }
00169
00170 if (do_sort)
00171 {
00172 delete[] avec;
00173 delete[] bvec;
00174 }
00175
00176 return result;
00177 }
00178
00179 void CWeightedCommWordStringKernel::add_to_normal(
00180 int32_t vec_idx, float64_t weight)
00181 {
00182 int32_t len=-1;
00183 CStringFeatures<uint16_t>* s=(CStringFeatures<uint16_t>*) lhs;
00184 uint16_t* vec=s->get_feature_vector(vec_idx, len);
00185
00186 if (len>0)
00187 {
00188 for (int32_t j=0; j<len; j++)
00189 {
00190 uint8_t mask=0;
00191 int32_t offs=0;
00192 for (int32_t d=0; d<degree; d++)
00193 {
00194 mask = mask | (1 << (degree-d-1));
00195 int32_t idx=s->get_masked_symbols(vec[j], mask);
00196 idx=s->shift_symbol(idx, degree-d-1);
00197 dictionary_weights[offs + idx] += normalizer->normalize_lhs(weight*weights[d], vec_idx);
00198 offs+=s->shift_offset(1,d+1);
00199 }
00200 }
00201
00202 set_is_initialized(true);
00203 }
00204 }
00205
00206 void CWeightedCommWordStringKernel::merge_normal()
00207 {
00208 ASSERT(get_is_initialized());
00209 ASSERT(use_sign==false);
00210
00211 CStringFeatures<uint16_t>* s=(CStringFeatures<uint16_t>*) rhs;
00212 uint32_t num_symbols=(uint32_t) s->get_num_symbols();
00213 int32_t dic_size=1<<(sizeof(uint16_t)*8);
00214 float64_t* dic=new float64_t[dic_size];
00215 memset(dic, 0, sizeof(float64_t)*dic_size);
00216
00217 for (uint32_t sym=0; sym<num_symbols; sym++)
00218 {
00219 float64_t result=0;
00220 uint8_t mask=0;
00221 int32_t offs=0;
00222 for (int32_t d=0; d<degree; d++)
00223 {
00224 mask = mask | (1 << (degree-d-1));
00225 int32_t idx=s->get_masked_symbols(sym, mask);
00226 idx=s->shift_symbol(idx, degree-d-1);
00227 result += dictionary_weights[offs + idx];
00228 offs+=s->shift_offset(1,d+1);
00229 }
00230 dic[sym]=result;
00231 }
00232
00233 init_dictionary(1<<(sizeof(uint16_t)*8));
00234 memcpy(dictionary_weights, dic, sizeof(float64_t)*dic_size);
00235 delete[] dic;
00236 }
00237
00238 float64_t CWeightedCommWordStringKernel::compute_optimized(int32_t i)
00239 {
00240 if (!get_is_initialized())
00241 SG_ERROR( "CCommWordStringKernel optimization not initialized\n");
00242
00243 ASSERT(use_sign==false);
00244
00245 float64_t result=0;
00246 int32_t len=-1;
00247 CStringFeatures<uint16_t>* s=(CStringFeatures<uint16_t>*) rhs;
00248 uint16_t* vec=s->get_feature_vector(i, len);
00249
00250 if (vec && len>0)
00251 {
00252 for (int32_t j=0; j<len; j++)
00253 {
00254 uint8_t mask=0;
00255 int32_t offs=0;
00256 for (int32_t d=0; d<degree; d++)
00257 {
00258 mask = mask | (1 << (degree-d-1));
00259 int32_t idx=s->get_masked_symbols(vec[j], mask);
00260 idx=s->shift_symbol(idx, degree-d-1);
00261 result += dictionary_weights[offs + idx]*weights[d];
00262 offs+=s->shift_offset(1,d+1);
00263 }
00264 }
00265
00266 result=normalizer->normalize_rhs(result, i);
00267 }
00268 return result;
00269 }
00270
00271 float64_t* CWeightedCommWordStringKernel::compute_scoring(
00272 int32_t max_degree, int32_t& num_feat, int32_t& num_sym, float64_t* target,
00273 int32_t num_suppvec, int32_t* IDX, float64_t* alphas, bool do_init)
00274 {
00275 if (do_init)
00276 CCommWordStringKernel::init_optimization(num_suppvec, IDX, alphas);
00277
00278 int32_t dic_size=1<<(sizeof(uint16_t)*9);
00279 float64_t* dic=new float64_t[dic_size];
00280 memcpy(dic, dictionary_weights, sizeof(float64_t)*dic_size);
00281
00282 merge_normal();
00283 float64_t* result=CCommWordStringKernel::compute_scoring(max_degree, num_feat,
00284 num_sym, target, num_suppvec, IDX, alphas, false);
00285
00286 init_dictionary(1<<(sizeof(uint16_t)*9));
00287 memcpy(dictionary_weights,dic, sizeof(float64_t)*dic_size);
00288 delete[] dic;
00289
00290 return result;
00291 }