HashedWDFeaturesTransposed.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2010 Soeren Sonnenburg
00008  * Copyright (C) 2010 Berlin Institute of Technology
00009  */
00010 
00011 #include "features/HashedWDFeaturesTransposed.h"
00012 #include "lib/io.h"
00013 #include "lib/Signal.h"
00014 #include "base/Parallel.h"
00015 
00016 #ifndef WIN32
00017 #include <pthread.h>
00018 #endif
00019 
00020 using namespace shogun;
00021 
#ifndef DOXYGEN_SHOULD_SKIP_THIS
/** Parameter bundle handed to dense_dot_range_helper(), either called
 * directly (single-threaded) or as a pthread entry point. */
struct HASHEDWD_THREAD_PARAM
{
    /** feature object the dot products are computed on */
    CHashedWDFeaturesTransposed* hf;
    /** optional array of vector indices; NULL means process the
     * contiguous range [start,stop) directly */
    int32_t* sub_index;
    /** result array the helper writes into */
    float64_t* output;
    /** first index (inclusive) this worker handles */
    int32_t start;
    /** last index (exclusive) this worker handles */
    int32_t stop;
    /** optional per-vector scaling factors; may be NULL */
    float64_t* alphas;
    /** dense weight vector of dimension w_dim */
    float64_t* vec;
    /** bias added to every output entry */
    float64_t bias;
    /** whether to emit progress output */
    bool progress;
    /** shared scratch array holding the running hash per vector
     * (each worker only touches its own [start,stop) slice) */
    uint32_t* index;
};
#endif // DOXYGEN_SHOULD_SKIP_THIS
00037 
00038 CHashedWDFeaturesTransposed::CHashedWDFeaturesTransposed(CStringFeatures<uint8_t>* str,
00039         int32_t start_order, int32_t order, int32_t from_order,
00040         int32_t hash_bits) : CDotFeatures()
00041 {
00042     ASSERT(start_order>=0);
00043     ASSERT(start_order<order);
00044     ASSERT(order<=from_order);
00045     ASSERT(hash_bits>0);
00046     ASSERT(str);
00047     ASSERT(str->have_same_length());
00048     SG_REF(str);
00049 
00050     strings=str;
00051     int32_t transposed_num_feat=0;
00052     int32_t transposed_num_vec=0;
00053     transposed_strings=str->get_transposed(transposed_num_feat, transposed_num_vec);
00054 
00055     string_length=str->get_max_vector_length();
00056     num_strings=str->get_num_vectors();
00057     ASSERT(transposed_num_feat==num_strings);
00058     ASSERT(transposed_num_vec==string_length);
00059 
00060     CAlphabet* alpha=str->get_alphabet();
00061     alphabet_size=alpha->get_num_symbols();
00062     SG_UNREF(alpha);
00063 
00064     degree=order;
00065     start_degree=start_order;
00066     if (start_degree!=0)
00067         SG_NOTIMPLEMENTED;
00068     from_degree=from_order;
00069     m_hash_bits=hash_bits;
00070     set_wd_weights();
00071     set_normalization_const();
00072 }
00073 
00074 CHashedWDFeaturesTransposed::CHashedWDFeaturesTransposed(const CHashedWDFeaturesTransposed& orig)
00075     : CDotFeatures(orig), strings(orig.strings), transposed_strings(transposed_strings),
00076     degree(orig.degree), start_degree(orig.start_degree), 
00077     from_degree(orig.from_degree), m_hash_bits(orig.m_hash_bits),
00078     normalization_const(orig.normalization_const)
00079 {
00080     SG_REF(strings);
00081     string_length=strings->get_max_vector_length();
00082     num_strings=strings->get_num_vectors();
00083     CAlphabet* alpha=strings->get_alphabet();
00084     alphabet_size=alpha->get_num_symbols();
00085     SG_UNREF(alpha);
00086 
00087     set_wd_weights();
00088 }
00089 
00090 CHashedWDFeaturesTransposed::~CHashedWDFeaturesTransposed()
00091 {
00092     for (int32_t i=0; i<string_length; i++)
00093         delete[] transposed_strings[i].string;
00094     delete[] transposed_strings;
00095 
00096     SG_UNREF(strings);
00097     delete[] wd_weights;
00098 }
00099 
00100 float64_t CHashedWDFeaturesTransposed::dot(int32_t vec_idx1, int32_t vec_idx2)
00101 {
00102     int32_t len1, len2;
00103     bool free_vec1, free_vec2;
00104 
00105     uint8_t* vec1=strings->get_feature_vector(vec_idx1, len1, free_vec1);
00106     uint8_t* vec2=strings->get_feature_vector(vec_idx2, len2, free_vec2);
00107 
00108     ASSERT(len1==len2);
00109 
00110     float64_t sum=0.0;
00111 
00112     for (int32_t i=0; i<len1; i++)
00113     {
00114         for (int32_t j=0; (i+j<len1) && (j<degree); j++)
00115         {
00116             if (vec1[i+j]!=vec2[i+j])
00117                 break;
00118             if (j>=start_degree)
00119                 sum += wd_weights[j]*wd_weights[j];
00120         }
00121     }
00122     strings->free_feature_vector(vec1, vec_idx1, free_vec1);
00123     strings->free_feature_vector(vec2, vec_idx2, free_vec2);
00124     return sum/CMath::sq(normalization_const);
00125 }
00126 
00127 float64_t CHashedWDFeaturesTransposed::dense_dot(int32_t vec_idx1, const float64_t* vec2, int32_t vec2_len)
00128 {
00129     if (vec2_len != w_dim)
00130         SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim);
00131 
00132     float64_t sum=0;
00133     int32_t len;
00134     bool free_vec1;
00135     uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1);
00136     uint32_t* val=new uint32_t[len];
00137 
00138     uint32_t offs=0;
00139 
00140     CMath::fill_vector(val, len, 0xDEADBEAF);
00141 
00142     for (int32_t i=0; i < len; i++) 
00143     {
00144         uint32_t o=offs;
00145         for (int32_t k=0; k<degree && i+k<len; k++)
00146         {
00147             const float64_t wd = wd_weights[k];
00148             const uint32_t h=CHash::IncrementalMurmurHash2(vec[i+k], val[i]);
00149             val[i]=h;
00150 #ifdef DEBUG_HASHEDWD
00151             SG_PRINT("vec[i]=%d, k=%d, offs=%d o=%d h=%d \n", vec[i], k,offs, o, h);
00152 #endif
00153             sum+=vec2[o+(h & mask)]*wd;
00154             o+=partial_w_dim;
00155         }
00156         offs+=partial_w_dim*degree;
00157     }
00158     delete[] val;
00159     strings->free_feature_vector(vec, vec_idx1, free_vec1);
00160 
00161     return sum/normalization_const;
00162 }
00163 
00164 void CHashedWDFeaturesTransposed::dense_dot_range(float64_t* output, int32_t start, int32_t stop, float64_t* alphas, float64_t* vec, int32_t dim, float64_t b)
00165 {
00166     ASSERT(output);
00167     // write access is internally between output[start..stop] so the following
00168     // line is necessary to write to output[0...(stop-start-1)]
00169     output-=start; 
00170     ASSERT(start>=0);
00171     ASSERT(start<stop);
00172     ASSERT(stop<=get_num_vectors());
00173     uint32_t* index=new uint32_t[stop];
00174 
00175     int32_t num_vectors=stop-start;
00176     ASSERT(num_vectors>0);
00177 
00178     int32_t num_threads=parallel->get_num_threads();
00179     ASSERT(num_threads>0);
00180 
00181     CSignal::clear_cancel();
00182 
00183     if (dim != w_dim)
00184         SG_ERROR("Dimensions don't match, vec_len=%d, w_dim=%d\n", dim, w_dim);
00185 
00186 #ifndef WIN32
00187     if (num_threads < 2)
00188     {
00189 #endif
00190         HASHEDWD_THREAD_PARAM params;
00191         params.hf=this;
00192         params.sub_index=NULL;
00193         params.output=output;
00194         params.start=start;
00195         params.stop=stop;
00196         params.alphas=alphas;
00197         params.vec=vec;
00198         params.bias=b;
00199         params.progress=false; //true;
00200         params.index=index;
00201         dense_dot_range_helper((void*) &params);
00202 #ifndef WIN32
00203     }
00204     else
00205     {
00206         pthread_t* threads = new pthread_t[num_threads-1];
00207         HASHEDWD_THREAD_PARAM* params = new HASHEDWD_THREAD_PARAM[num_threads];
00208         int32_t step= num_vectors/num_threads;
00209 
00210         int32_t t;
00211 
00212         for (t=0; t<num_threads-1; t++)
00213         {
00214             params[t].hf = this;
00215             params[t].sub_index=NULL;
00216             params[t].output = output;
00217             params[t].start = start+t*step;
00218             params[t].stop = start+(t+1)*step;
00219             params[t].alphas=alphas;
00220             params[t].vec=vec;
00221             params[t].bias=b;
00222             params[t].progress = false;
00223             params[t].index=index;
00224             pthread_create(&threads[t], NULL,
00225                     CHashedWDFeaturesTransposed::dense_dot_range_helper, (void*)&params[t]);
00226         }
00227 
00228         params[t].hf = this;
00229         params[t].sub_index=NULL;
00230         params[t].output = output;
00231         params[t].start = start+t*step;
00232         params[t].stop = stop;
00233         params[t].alphas=alphas;
00234         params[t].vec=vec;
00235         params[t].bias=b;
00236         params[t].progress = false; //true;
00237         params[t].index=index;
00238         CHashedWDFeaturesTransposed::dense_dot_range_helper((void*) &params[t]);
00239 
00240         for (t=0; t<num_threads-1; t++)
00241             pthread_join(threads[t], NULL);
00242 
00243         delete[] params;
00244         delete[] threads;
00245         delete[] index;
00246     }
00247 #endif
00248 
00249 #ifndef WIN32
00250         if ( CSignal::cancel_computations() )
00251             SG_INFO( "prematurely stopped.           \n");
00252 #endif
00253 }
00254 
00255 void CHashedWDFeaturesTransposed::dense_dot_range_subset(int32_t* sub_index, int num, float64_t* output, float64_t* alphas, float64_t* vec, int32_t dim, float64_t b)
00256 {
00257     ASSERT(sub_index);
00258     ASSERT(output);
00259 
00260     uint32_t* index=new uint32_t[num];
00261 
00262     int32_t num_threads=parallel->get_num_threads();
00263     ASSERT(num_threads>0);
00264 
00265     CSignal::clear_cancel();
00266 
00267     if (dim != w_dim)
00268         SG_ERROR("Dimensions don't match, vec_len=%d, w_dim=%d\n", dim, w_dim);
00269 
00270 #ifndef WIN32
00271     if (num_threads < 2)
00272     {
00273 #endif
00274         HASHEDWD_THREAD_PARAM params;
00275         params.hf=this;
00276         params.sub_index=sub_index;
00277         params.output=output;
00278         params.start=0;
00279         params.stop=num;
00280         params.alphas=alphas;
00281         params.vec=vec;
00282         params.bias=b;
00283         params.progress=false; //true;
00284         params.index=index;
00285         dense_dot_range_helper((void*) &params);
00286 #ifndef WIN32
00287     }
00288     else
00289     {
00290         pthread_t* threads = new pthread_t[num_threads-1];
00291         HASHEDWD_THREAD_PARAM* params = new HASHEDWD_THREAD_PARAM[num_threads];
00292         int32_t step= num/num_threads;
00293 
00294         int32_t t;
00295 
00296         for (t=0; t<num_threads-1; t++)
00297         {
00298             params[t].hf = this;
00299             params[t].sub_index=sub_index;
00300             params[t].output = output;
00301             params[t].start = t*step;
00302             params[t].stop = (t+1)*step;
00303             params[t].alphas=alphas;
00304             params[t].vec=vec;
00305             params[t].bias=b;
00306             params[t].progress = false;
00307             params[t].index=index;
00308             pthread_create(&threads[t], NULL,
00309                     CHashedWDFeaturesTransposed::dense_dot_range_helper, (void*)&params[t]);
00310         }
00311 
00312         params[t].hf = this;
00313         params[t].sub_index=sub_index;
00314         params[t].output = output;
00315         params[t].start = t*step;
00316         params[t].stop = num;
00317         params[t].alphas=alphas;
00318         params[t].vec=vec;
00319         params[t].bias=b;
00320         params[t].progress = false; //true;
00321         params[t].index=index;
00322         CHashedWDFeaturesTransposed::dense_dot_range_helper((void*) &params[t]);
00323 
00324         for (t=0; t<num_threads-1; t++)
00325             pthread_join(threads[t], NULL);
00326 
00327         delete[] params;
00328         delete[] threads;
00329         delete[] index;
00330     }
00331 #endif
00332 
00333 #ifndef WIN32
00334         if ( CSignal::cancel_computations() )
00335             SG_INFO( "prematurely stopped.           \n");
00336 #endif
00337 }
00338 
void* CHashedWDFeaturesTransposed::dense_dot_range_helper(void* p)
{
    /* Worker routine (also a pthread entry point): computes the hashed-WD
     * dot product w.x (optionally scaled by alphas, plus bias) for the
     * vector indices [start,stop) described by the HASHEDWD_THREAD_PARAM.
     * It iterates over the TRANSPOSED strings, i.e. position-by-position
     * across all vectors at once, updating one incremental hash per
     * vector in the shared 'index' scratch array. */
    HASHEDWD_THREAD_PARAM* par=(HASHEDWD_THREAD_PARAM*) p;
    CHashedWDFeaturesTransposed* hf=par->hf;
    int32_t* sub_index=par->sub_index;
    float64_t* output=par->output;
    int32_t start=par->start;
    int32_t stop=par->stop;
    float64_t* alphas=par->alphas;
    float64_t* vec=par->vec;
    float64_t bias=par->bias;
    bool progress=par->progress;
    uint32_t* index=par->index;
    // cache frequently used members of hf in locals
    int32_t string_length=hf->string_length;
    int32_t degree=hf->degree;
    float64_t* wd_weights=hf->wd_weights;
    T_STRING<uint8_t>* transposed_strings=hf->transposed_strings;
    uint32_t mask=hf->mask;
    int32_t partial_w_dim=hf->partial_w_dim;
    float64_t normalization_const=hf->normalization_const;

    if (sub_index)
    {
        /* subset mode: j indexes into sub_index, which holds the actual
         * vector numbers */
        for (int32_t j=start; j<stop; j++)
            output[j]=0.0;

        uint32_t offs=0;
        for (int32_t i=0; i<string_length; i++)
        {
            uint32_t o=offs;
            for (int32_t k=0; k<degree && i+k<string_length; k++)
            {
                const float64_t wd = wd_weights[k];
                // column of character (i+k) across all vectors
                uint8_t* dim=transposed_strings[i+k].string;
                uint32_t h;

                for (int32_t j=start; j<stop; j++)
                {
                    uint8_t bval=dim[sub_index[j]];
                    // k==0 starts a new k-mer at position i (seed hash);
                    // k>0 extends the hash accumulated in index[j]
                    if (k==0)
                        h=CHash::IncrementalMurmurHash2(bval, 0xDEADBEAF);
                    else
                        h=CHash::IncrementalMurmurHash2(bval, index[j]);
                    index[j]=h;
                    // masked hash selects the weight inside this
                    // (position, order) slice of vec
                    output[j]+=vec[o + (h & mask)]*wd;
                }
                o+=partial_w_dim;
            }
            // advance to the weight block of the next string position
            offs+=partial_w_dim*degree;

            if (progress)
                hf->io->progress(i, 0,string_length, i);
        }

        // normalize and apply optional alpha scaling and bias
        for (int32_t j=start; j<stop; j++)
        {
            if (alphas)
                output[j]=output[j]*alphas[sub_index[j]]/normalization_const+bias;
            else
                output[j]=output[j]/normalization_const+bias;
        }
    }
    else
    {
        /* range mode: j IS the vector number (output was shifted by the
         * caller so output[j] is valid for j in [start,stop)) */
        CMath::fill_vector(&output[start], stop-start, 0.0);

        uint32_t offs=0;
        for (int32_t i=0; i<string_length; i++)
        {
            uint32_t o=offs;
            for (int32_t k=0; k<degree && i+k<string_length; k++)
            {
                const float64_t wd = wd_weights[k];
                // column of character (i+k) across all vectors
                uint8_t* dim=transposed_strings[i+k].string;
                uint32_t h;

                for (int32_t j=start; j<stop; j++)
                {
                    // k==0 starts a new k-mer at position i (seed hash);
                    // k>0 extends the hash accumulated in index[j]
                    if (k==0)
                        h=CHash::IncrementalMurmurHash2(dim[j], 0xDEADBEAF);
                    else
                        h=CHash::IncrementalMurmurHash2(dim[j], index[j]);
                    index[j]=h;
                    // masked hash selects the weight inside this
                    // (position, order) slice of vec
                    output[j]+=vec[o + (h & mask)]*wd;
                }
                o+=partial_w_dim;
            }
            // advance to the weight block of the next string position
            offs+=partial_w_dim*degree;

            if (progress)
                hf->io->progress(i, 0,string_length, i);
        }

        // normalize and apply optional alpha scaling and bias
        for (int32_t j=start; j<stop; j++)
        {
            if (alphas)
                output[j]=output[j]*alphas[j]/normalization_const+bias;
            else
                output[j]=output[j]/normalization_const+bias;
        }
    }

    return NULL;
}
00443 
00444 void CHashedWDFeaturesTransposed::add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t* vec2, int32_t vec2_len, bool abs_val)
00445 {
00446     if (vec2_len != w_dim)
00447         SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim);
00448 
00449     int32_t len;
00450     bool free_vec1;
00451     uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1);
00452     uint32_t* val=new uint32_t[len];
00453 
00454     uint32_t offs=0;
00455     float64_t factor=alpha/normalization_const;
00456     if (abs_val)
00457         factor=CMath::abs(factor);
00458 
00459     CMath::fill_vector(val, len, 0xDEADBEAF);
00460 
00461     for (int32_t i=0; i<len; i++) 
00462     {
00463         uint32_t o=offs;
00464         for (int32_t k=0; k<degree && i+k<len; k++)
00465         {
00466             float64_t wd = wd_weights[k]*factor;
00467 
00468             const uint32_t h=CHash::IncrementalMurmurHash2(vec[i+k], val[i]);
00469             val[i]=h;
00470 
00471 #ifdef DEBUG_HASHEDWD
00472             SG_PRINT("offs=%d o=%d h=%d \n", offs, o, h);
00473             SG_PRINT("vec[i]=%d, k=%d, offs=%d o=%d h=%d \n", vec[i], k,offs, o, h);
00474 #endif
00475             vec2[o+(h & mask)]+=wd;
00476             o+=partial_w_dim;
00477         }
00478         offs+=partial_w_dim*degree;
00479     }
00480 
00481     delete[] val;
00482     strings->free_feature_vector(vec, vec_idx1, free_vec1);
00483 }
00484 
00485 void CHashedWDFeaturesTransposed::set_wd_weights()
00486 {
00487     ASSERT(degree>0);
00488 
00489     mask=(uint32_t) (((uint64_t) 1)<<m_hash_bits)-1;
00490     partial_w_dim=1<<m_hash_bits;
00491     w_dim=partial_w_dim*string_length*(degree-start_degree);
00492 
00493     wd_weights=new float64_t[degree];
00494 
00495     for (int32_t i=0; i<degree; i++)
00496         wd_weights[i]=sqrt(2.0*(from_degree-i)/(from_degree*(from_degree+1)));
00497 
00498     SG_DEBUG("created HashedWDFeaturesTransposed with d=%d (%d), alphabetsize=%d, "
00499             "dim=%d partial_dim=%d num=%d, len=%d\n", 
00500             degree, from_degree, alphabet_size, 
00501             w_dim, partial_w_dim, num_strings, string_length);
00502 }
00503 
00504 
00505 void CHashedWDFeaturesTransposed::set_normalization_const(float64_t n)
00506 {
00507     if (n==0)
00508     {
00509         normalization_const=0;
00510         for (int32_t i=0; i<degree; i++)
00511             normalization_const+=(string_length-i)*wd_weights[i]*wd_weights[i];
00512 
00513         normalization_const=CMath::sqrt(normalization_const);
00514     }
00515     else
00516         normalization_const=n;
00517 
00518     SG_DEBUG("normalization_const:%f\n", normalization_const);
00519 }
00520 
/** Create a deep-copied duplicate of this feature object via the copy
 * constructor. Caller takes ownership of the returned object. */
CFeatures* CHashedWDFeaturesTransposed::duplicate() const
{
    return new CHashedWDFeaturesTransposed(*this);
}
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation