#include "features/HashedWDFeaturesTransposed.h"
#include "lib/io.h"
#include "lib/Signal.h"
#include "base/Parallel.h"

#ifndef WIN32
#include <pthread.h>
#endif

using namespace shogun;

#ifndef DOXYGEN_SHOULD_SKIP_THIS
struct HASHEDWD_THREAD_PARAM
{
    CHashedWDFeaturesTransposed* hf;
    int32_t* sub_index;
    float64_t* output;
    int32_t start;
    int32_t stop;
    float64_t* alphas;
    float64_t* vec;
    float64_t bias;
    bool progress;
    uint32_t* index;
};
#endif // DOXYGEN_SHOULD_SKIP_THIS

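/* Build hashed WD features over byte string features: the strings are kept in
 * a transposed (position-major) copy so that batched dot products can stream
 * over positions, and the WD weights and normalization constant are
 * precomputed. */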
CHashedWDFeaturesTransposed::CHashedWDFeaturesTransposed(CStringFeatures<uint8_t>* str,
        int32_t start_order, int32_t order, int32_t from_order,
        int32_t hash_bits) : CDotFeatures()
{
    ASSERT(start_order>=0);
    ASSERT(start_order<order);
    ASSERT(order<=from_order);
    ASSERT(hash_bits>0);
    ASSERT(str);
    ASSERT(str->have_same_length());
    SG_REF(str);

    strings=str;
    int32_t transposed_num_feat=0;
    int32_t transposed_num_vec=0;
    transposed_strings=str->get_transposed(transposed_num_feat, transposed_num_vec);

    string_length=str->get_max_vector_length();
    num_strings=str->get_num_vectors();
    ASSERT(transposed_num_feat==num_strings);
    ASSERT(transposed_num_vec==string_length);

    CAlphabet* alpha=str->get_alphabet();
    alphabet_size=alpha->get_num_symbols();
    SG_UNREF(alpha);

    degree=order;
    start_degree=start_order;
    if (start_degree!=0)
        SG_NOTIMPLEMENTED;
    from_degree=from_order;
    m_hash_bits=hash_bits;
    set_wd_weights();
    set_normalization_const();
}

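/* Copy constructor; note that transposed_strings is shared with the original
 * object rather than deep-copied. */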
CHashedWDFeaturesTransposed::CHashedWDFeaturesTransposed(const CHashedWDFeaturesTransposed& orig)
    : CDotFeatures(orig), strings(orig.strings),
    transposed_strings(orig.transposed_strings),
    degree(orig.degree), start_degree(orig.start_degree),
    from_degree(orig.from_degree), m_hash_bits(orig.m_hash_bits),
    normalization_const(orig.normalization_const)
{
    SG_REF(strings);
    string_length=strings->get_max_vector_length();
    num_strings=strings->get_num_vectors();
    CAlphabet* alpha=strings->get_alphabet();
    alphabet_size=alpha->get_num_symbols();
    SG_UNREF(alpha);

    set_wd_weights();
}

CHashedWDFeaturesTransposed::~CHashedWDFeaturesTransposed()
{
    for (int32_t i=0; i<string_length; i++)
        delete[] transposed_strings[i].string;
    delete[] transposed_strings;

    SG_UNREF(strings);
    delete[] wd_weights;
}

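/* WD kernel value between two strings of this feature object: at every
 * position, matching k-mer prefixes (start_degree<=k<degree) contribute the
 * squared order weight; the sum is divided by the squared normalization
 * constant. */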
float64_t CHashedWDFeaturesTransposed::dot(int32_t vec_idx1, int32_t vec_idx2)
{
    int32_t len1, len2;
    bool free_vec1, free_vec2;

    uint8_t* vec1=strings->get_feature_vector(vec_idx1, len1, free_vec1);
    uint8_t* vec2=strings->get_feature_vector(vec_idx2, len2, free_vec2);

    ASSERT(len1==len2);

    float64_t sum=0.0;

    for (int32_t i=0; i<len1; i++)
    {
        for (int32_t j=0; (i+j<len1) && (j<degree); j++)
        {
            if (vec1[i+j]!=vec2[i+j])
                break;
            if (j>=start_degree)
                sum += wd_weights[j]*wd_weights[j];
        }
    }
    strings->free_feature_vector(vec1, vec_idx1, free_vec1);
    strings->free_feature_vector(vec2, vec_idx2, free_vec2);
    return sum/CMath::sq(normalization_const);
}

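/* Dot product between one string and a dense weight vector vec2 of dimension
 * w_dim: every k-mer prefix at every position is hashed incrementally with
 * MurmurHash2 into its (position, order) block of size partial_w_dim, and the
 * addressed entry of vec2 is accumulated with weight wd_weights[k]. */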
float64_t CHashedWDFeaturesTransposed::dense_dot(int32_t vec_idx1, const float64_t* vec2, int32_t vec2_len)
{
    if (vec2_len != w_dim)
        SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim);

    float64_t sum=0;
    int32_t len;
    bool free_vec1;
    uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1);
    uint32_t* val=new uint32_t[len];

    uint32_t offs=0;

    CMath::fill_vector(val, len, 0xDEADBEAF);

    for (int32_t i=0; i<len; i++)
    {
        uint32_t o=offs;
        for (int32_t k=0; k<degree && i+k<len; k++)
        {
            const float64_t wd = wd_weights[k];
            const uint32_t h=CHash::IncrementalMurmurHash2(vec[i+k], val[i]);
            val[i]=h;
#ifdef DEBUG_HASHEDWD
            SG_PRINT("vec[i]=%d, k=%d, offs=%d o=%d h=%d\n", vec[i], k, offs, o, h);
#endif
            sum+=vec2[o+(h & mask)]*wd;
            o+=partial_w_dim;
        }
        offs+=partial_w_dim*degree;
    }
    delete[] val;
    strings->free_feature_vector(vec, vec_idx1, free_vec1);

    return sum/normalization_const;
}

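/* For all vectors in [start, stop): output[i]=alphas[i]*(x_i . vec)/normalization_const + b
 * (without alphas if alphas==NULL). The range is split across
 * parallel->get_num_threads() pthreads on non-WIN32 builds; the shared index
 * array holds the per-vector incremental hash state. */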
void CHashedWDFeaturesTransposed::dense_dot_range(float64_t* output, int32_t start, int32_t stop, float64_t* alphas, float64_t* vec, int32_t dim, float64_t b)
{
    ASSERT(output);

    // shift so that output[start..stop-1] is written internally
    output-=start;
    ASSERT(start>=0);
    ASSERT(start<stop);
    ASSERT(stop<=get_num_vectors());
    uint32_t* index=new uint32_t[stop];

    int32_t num_vectors=stop-start;
    ASSERT(num_vectors>0);

    int32_t num_threads=parallel->get_num_threads();
    ASSERT(num_threads>0);

    CSignal::clear_cancel();

    if (dim != w_dim)
        SG_ERROR("Dimensions don't match, vec_len=%d, w_dim=%d\n", dim, w_dim);

#ifndef WIN32
    if (num_threads < 2)
    {
#endif
        HASHEDWD_THREAD_PARAM params;
        params.hf=this;
        params.sub_index=NULL;
        params.output=output;
        params.start=start;
        params.stop=stop;
        params.alphas=alphas;
        params.vec=vec;
        params.bias=b;
        params.progress=false;
        params.index=index;
        dense_dot_range_helper((void*) &params);
#ifndef WIN32
    }
    else
    {
        pthread_t* threads = new pthread_t[num_threads-1];
        HASHEDWD_THREAD_PARAM* params = new HASHEDWD_THREAD_PARAM[num_threads];
        int32_t step=num_vectors/num_threads;

        int32_t t;

        for (t=0; t<num_threads-1; t++)
        {
            params[t].hf = this;
            params[t].sub_index=NULL;
            params[t].output = output;
            params[t].start = start+t*step;
            params[t].stop = start+(t+1)*step;
            params[t].alphas=alphas;
            params[t].vec=vec;
            params[t].bias=b;
            params[t].progress = false;
            params[t].index=index;
            pthread_create(&threads[t], NULL,
                    CHashedWDFeaturesTransposed::dense_dot_range_helper, (void*)&params[t]);
        }

        params[t].hf = this;
        params[t].sub_index=NULL;
        params[t].output = output;
        params[t].start = start+t*step;
        params[t].stop = stop;
        params[t].alphas=alphas;
        params[t].vec=vec;
        params[t].bias=b;
        params[t].progress = false;
        params[t].index=index;
        CHashedWDFeaturesTransposed::dense_dot_range_helper((void*) &params[t]);

        for (t=0; t<num_threads-1; t++)
            pthread_join(threads[t], NULL);

        delete[] params;
        delete[] threads;
    }
#endif
    delete[] index;

#ifndef WIN32
    if ( CSignal::cancel_computations() )
        SG_INFO( "prematurely stopped.\n");
#endif
}

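/* Same computation as dense_dot_range(), but over the vectors selected by
 * sub_index[0..num-1]; output[j] corresponds to vector sub_index[j]. */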
void CHashedWDFeaturesTransposed::dense_dot_range_subset(int32_t* sub_index, int num, float64_t* output, float64_t* alphas, float64_t* vec, int32_t dim, float64_t b)
{
    ASSERT(sub_index);
    ASSERT(output);

    uint32_t* index=new uint32_t[num];

    int32_t num_threads=parallel->get_num_threads();
    ASSERT(num_threads>0);

    CSignal::clear_cancel();

    if (dim != w_dim)
        SG_ERROR("Dimensions don't match, vec_len=%d, w_dim=%d\n", dim, w_dim);

#ifndef WIN32
    if (num_threads < 2)
    {
#endif
        HASHEDWD_THREAD_PARAM params;
        params.hf=this;
        params.sub_index=sub_index;
        params.output=output;
        params.start=0;
        params.stop=num;
        params.alphas=alphas;
        params.vec=vec;
        params.bias=b;
        params.progress=false;
        params.index=index;
        dense_dot_range_helper((void*) &params);
#ifndef WIN32
    }
    else
    {
        pthread_t* threads = new pthread_t[num_threads-1];
        HASHEDWD_THREAD_PARAM* params = new HASHEDWD_THREAD_PARAM[num_threads];
        int32_t step=num/num_threads;

        int32_t t;

        for (t=0; t<num_threads-1; t++)
        {
            params[t].hf = this;
            params[t].sub_index=sub_index;
            params[t].output = output;
            params[t].start = t*step;
            params[t].stop = (t+1)*step;
            params[t].alphas=alphas;
            params[t].vec=vec;
            params[t].bias=b;
            params[t].progress = false;
            params[t].index=index;
            pthread_create(&threads[t], NULL,
                    CHashedWDFeaturesTransposed::dense_dot_range_helper, (void*)&params[t]);
        }

        params[t].hf = this;
        params[t].sub_index=sub_index;
        params[t].output = output;
        params[t].start = t*step;
        params[t].stop = num;
        params[t].alphas=alphas;
        params[t].vec=vec;
        params[t].bias=b;
        params[t].progress = false;
        params[t].index=index;
        CHashedWDFeaturesTransposed::dense_dot_range_helper((void*) &params[t]);

        for (t=0; t<num_threads-1; t++)
            pthread_join(threads[t], NULL);

        delete[] params;
        delete[] threads;
    }
#endif
    delete[] index;

#ifndef WIN32
    if ( CSignal::cancel_computations() )
        SG_INFO( "prematurely stopped.\n");
#endif
}

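/* Thread worker for dense_dot_range()/dense_dot_range_subset(). It iterates
 * position by position over the transposed strings, so for every order k the
 * incremental hash state of all vectors in [start, stop) (kept in index[]) is
 * advanced in one sequential pass over memory; alphas, normalization and bias
 * are applied at the end. */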
void* CHashedWDFeaturesTransposed::dense_dot_range_helper(void* p)
{
    HASHEDWD_THREAD_PARAM* par=(HASHEDWD_THREAD_PARAM*) p;
    CHashedWDFeaturesTransposed* hf=par->hf;
    int32_t* sub_index=par->sub_index;
    float64_t* output=par->output;
    int32_t start=par->start;
    int32_t stop=par->stop;
    float64_t* alphas=par->alphas;
    float64_t* vec=par->vec;
    float64_t bias=par->bias;
    bool progress=par->progress;
    uint32_t* index=par->index;
    int32_t string_length=hf->string_length;
    int32_t degree=hf->degree;
    float64_t* wd_weights=hf->wd_weights;
    T_STRING<uint8_t>* transposed_strings=hf->transposed_strings;
    uint32_t mask=hf->mask;
    int32_t partial_w_dim=hf->partial_w_dim;
    float64_t normalization_const=hf->normalization_const;

    if (sub_index)
    {
        for (int32_t j=start; j<stop; j++)
            output[j]=0.0;

        uint32_t offs=0;
        for (int32_t i=0; i<string_length; i++)
        {
            uint32_t o=offs;
            for (int32_t k=0; k<degree && i+k<string_length; k++)
            {
                const float64_t wd = wd_weights[k];
                uint8_t* dim=transposed_strings[i+k].string;
                uint32_t h;

                for (int32_t j=start; j<stop; j++)
                {
                    uint8_t bval=dim[sub_index[j]];
                    if (k==0)
                        h=CHash::IncrementalMurmurHash2(bval, 0xDEADBEAF);
                    else
                        h=CHash::IncrementalMurmurHash2(bval, index[j]);
                    index[j]=h;
                    output[j]+=vec[o + (h & mask)]*wd;
                }
                o+=partial_w_dim;
            }
            offs+=partial_w_dim*degree;

            if (progress)
                hf->io->progress(i, 0, string_length, i);
        }

        for (int32_t j=start; j<stop; j++)
        {
            if (alphas)
                output[j]=output[j]*alphas[sub_index[j]]/normalization_const+bias;
            else
                output[j]=output[j]/normalization_const+bias;
        }
    }
    else
    {
        CMath::fill_vector(&output[start], stop-start, 0.0);

        uint32_t offs=0;
        for (int32_t i=0; i<string_length; i++)
        {
            uint32_t o=offs;
            for (int32_t k=0; k<degree && i+k<string_length; k++)
            {
                const float64_t wd = wd_weights[k];
                uint8_t* dim=transposed_strings[i+k].string;
                uint32_t h;

                for (int32_t j=start; j<stop; j++)
                {
                    if (k==0)
                        h=CHash::IncrementalMurmurHash2(dim[j], 0xDEADBEAF);
                    else
                        h=CHash::IncrementalMurmurHash2(dim[j], index[j]);
                    index[j]=h;
                    output[j]+=vec[o + (h & mask)]*wd;
                }
                o+=partial_w_dim;
            }
            offs+=partial_w_dim*degree;

            if (progress)
                hf->io->progress(i, 0, string_length, i);
        }

        for (int32_t j=start; j<stop; j++)
        {
            if (alphas)
                output[j]=output[j]*alphas[j]/normalization_const+bias;
            else
                output[j]=output[j]/normalization_const+bias;
        }
    }

    return NULL;
}

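/* Add alpha times the hashed WD representation of vector vec_idx1 to the
 * dense vector vec2 (the update step used by linear solvers on these
 * features); with abs_val the absolute values of the scaled weights are added
 * instead. */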
void CHashedWDFeaturesTransposed::add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t* vec2, int32_t vec2_len, bool abs_val)
{
    if (vec2_len != w_dim)
        SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim);

    int32_t len;
    bool free_vec1;
    uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1);
    uint32_t* val=new uint32_t[len];

    uint32_t offs=0;
    float64_t factor=alpha/normalization_const;
    if (abs_val)
        factor=CMath::abs(factor);

    CMath::fill_vector(val, len, 0xDEADBEAF);

    for (int32_t i=0; i<len; i++)
    {
        uint32_t o=offs;
        for (int32_t k=0; k<degree && i+k<len; k++)
        {
            float64_t wd = wd_weights[k]*factor;

            const uint32_t h=CHash::IncrementalMurmurHash2(vec[i+k], val[i]);
            val[i]=h;

#ifdef DEBUG_HASHEDWD
            SG_PRINT("offs=%d o=%d h=%d\n", offs, o, h);
            SG_PRINT("vec[i]=%d, k=%d, offs=%d o=%d h=%d\n", vec[i], k, offs, o, h);
#endif
            vec2[o+(h & mask)]+=wd;
            o+=partial_w_dim;
        }
        offs+=partial_w_dim*degree;
    }

    delete[] val;
    strings->free_feature_vector(vec, vec_idx1, free_vec1);
}

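/* Derive the hashing parameters and WD weights: partial_w_dim and mask follow
 * from m_hash_bits, w_dim reserves one hash block per (position, order) pair,
 * and wd_weights[k]=sqrt(2*(D-k)/(D*(D+1))) with D=from_degree are the usual
 * WD kernel order weights. */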
void CHashedWDFeaturesTransposed::set_wd_weights()
{
    ASSERT(degree>0);

    mask=(uint32_t) (((uint64_t) 1)<<m_hash_bits)-1;
    partial_w_dim=1<<m_hash_bits;
    w_dim=partial_w_dim*string_length*(degree-start_degree);

    wd_weights=new float64_t[degree];

    for (int32_t i=0; i<degree; i++)
        wd_weights[i]=sqrt(2.0*(from_degree-i)/(from_degree*(from_degree+1)));

    SG_DEBUG("created HashedWDFeaturesTransposed with d=%d (%d), alphabetsize=%d, "
            "dim=%d partial_dim=%d num=%d, len=%d\n",
            degree, from_degree, alphabet_size,
            w_dim, partial_w_dim, num_strings, string_length);
}

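/* Set the normalization constant; n=0 selects the default
 * sqrt(sum_k (string_length-k)*wd_weights[k]^2), i.e. the feature-space norm
 * of a string that matches itself at every position. */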
void CHashedWDFeaturesTransposed::set_normalization_const(float64_t n)
{
    if (n==0)
    {
        normalization_const=0;
        for (int32_t i=0; i<degree; i++)
            normalization_const+=(string_length-i)*wd_weights[i]*wd_weights[i];

        normalization_const=CMath::sqrt(normalization_const);
    }
    else
        normalization_const=n;

    SG_DEBUG("normalization_const:%f\n", normalization_const);
}

CFeatures* CHashedWDFeaturesTransposed::duplicate() const
{
    return new CHashedWDFeaturesTransposed(*this);
}