HashedWDFeatures.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2010 Soeren Sonnenburg
00008  * Copyright (C) 2010 Berlin Institute of Technology
00009  */
00010 
00011 #include "features/HashedWDFeatures.h"
00012 #include "lib/io.h"
00013 
00014 using namespace shogun;
00015 
00016 CHashedWDFeatures::CHashedWDFeatures(CStringFeatures<uint8_t>* str,
00017         int32_t start_order, int32_t order, int32_t from_order,
00018         int32_t hash_bits) : CDotFeatures()
00019 {
00020     ASSERT(start_order>=0);
00021     ASSERT(start_order<order);
00022     ASSERT(order<=from_order);
00023     ASSERT(hash_bits>0);
00024     ASSERT(str);
00025     ASSERT(str->have_same_length());
00026     SG_REF(str);
00027 
00028     strings=str;
00029     string_length=str->get_max_vector_length();
00030     num_strings=str->get_num_vectors();
00031     CAlphabet* alpha=str->get_alphabet();
00032     alphabet_size=alpha->get_num_symbols();
00033     SG_UNREF(alpha);
00034 
00035     degree=order;
00036     start_degree=start_order;
00037     from_degree=from_order;
00038     m_hash_bits=hash_bits;
00039     set_wd_weights();
00040     set_normalization_const();
00041 }
00042 
00043 CHashedWDFeatures::CHashedWDFeatures(const CHashedWDFeatures& orig)
00044     : CDotFeatures(orig), strings(orig.strings),
00045     degree(orig.degree), start_degree(orig.start_degree), 
00046     from_degree(orig.from_degree), m_hash_bits(orig.m_hash_bits),
00047     normalization_const(orig.normalization_const)
00048 {
00049     SG_REF(strings);
00050     string_length=strings->get_max_vector_length();
00051     num_strings=strings->get_num_vectors();
00052     CAlphabet* alpha=strings->get_alphabet();
00053     alphabet_size=alpha->get_num_symbols();
00054     SG_UNREF(alpha);
00055 
00056     set_wd_weights();
00057 }
00058 
00059 CHashedWDFeatures::~CHashedWDFeatures()
00060 {
00061     SG_UNREF(strings);
00062     delete[] wd_weights;
00063 }
00064 
00065 float64_t CHashedWDFeatures::dot(int32_t vec_idx1, int32_t vec_idx2)
00066 {
00067     int32_t len1, len2;
00068     bool free_vec1, free_vec2;
00069 
00070     uint8_t* vec1=strings->get_feature_vector(vec_idx1, len1, free_vec1);
00071     uint8_t* vec2=strings->get_feature_vector(vec_idx2, len2, free_vec2);
00072 
00073     ASSERT(len1==len2);
00074 
00075     float64_t sum=0.0;
00076 
00077     for (int32_t i=0; i<len1; i++)
00078     {
00079         for (int32_t j=0; (i+j<len1) && (j<degree); j++)
00080         {
00081             if (vec1[i+j]!=vec2[i+j])
00082                 break;
00083             if (j>=start_degree)
00084                 sum += wd_weights[j]*wd_weights[j];
00085         }
00086     }
00087     strings->free_feature_vector(vec1, vec_idx1, free_vec1);
00088     strings->free_feature_vector(vec2, vec_idx2, free_vec2);
00089     return sum/CMath::sq(normalization_const);
00090 }
00091 
00092 float64_t CHashedWDFeatures::dense_dot(int32_t vec_idx1, const float64_t* vec2, int32_t vec2_len)
00093 {
00094     if (vec2_len != w_dim)
00095         SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim);
00096 
00097     float64_t sum=0;
00098     int32_t lim=CMath::min(degree, string_length);
00099     int32_t len;
00100     bool free_vec1;
00101     uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1);
00102     uint32_t* val=new uint32_t[len];
00103 
00104     uint32_t offs=0;
00105 
00106     if (start_degree>0)
00107     {
00108         // compute hash for strings of length start_degree-1
00109         for (int32_t i=0; i+start_degree < len; i++) 
00110             val[i]=CHash::MurmurHash2(&vec[i], start_degree, 0xDEADBEAF);
00111     }
00112     else
00113         CMath::fill_vector(val, len, 0xDEADBEAF);
00114 
00115     for (int32_t k=start_degree; k<lim; k++)
00116     {
00117         float64_t wd = wd_weights[k];
00118 
00119         uint32_t o=offs;
00120         for (int32_t i=0; i+k < len; i++) 
00121         {
00122             const uint32_t h=CHash::IncrementalMurmurHash2(vec[i+k], val[i]);
00123             val[i]=h;
00124 #ifdef DEBUG_HASHEDWD
00125             SG_PRINT("vec[i]=%d, k=%d, offs=%d o=%d h=%d \n", vec[i], k,offs, o, h);
00126 #endif
00127             sum+=vec2[o+(h & mask)]*wd;
00128             o+=partial_w_dim;
00129         }
00130         offs+=partial_w_dim*len;
00131     }
00132     delete[] val;
00133     strings->free_feature_vector(vec, vec_idx1, free_vec1);
00134 
00135     return sum/normalization_const;
00136 }
00137 
00138 void CHashedWDFeatures::add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t* vec2, int32_t vec2_len, bool abs_val)
00139 {
00140     if (vec2_len != w_dim)
00141         SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim);
00142 
00143     int32_t lim=CMath::min(degree, string_length);
00144     int32_t len;
00145     bool free_vec1;
00146     uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1);
00147     uint32_t* val=new uint32_t[len];
00148 
00149     uint32_t offs=0;
00150 
00151     if (start_degree>0)
00152     {
00153         // compute hash for strings of length start_degree-1
00154         for (int32_t i=0; i+start_degree < len; i++) 
00155             val[i]=CHash::MurmurHash2(&vec[i], start_degree, 0xDEADBEAF);
00156     }
00157     else
00158         CMath::fill_vector(val, len, 0xDEADBEAF);
00159 
00160     for (int32_t k=start_degree; k<lim; k++)
00161     {
00162         float64_t wd = alpha*wd_weights[k]/normalization_const;
00163 
00164         if (abs_val)
00165             wd=CMath::abs(wd);
00166 
00167         uint32_t o=offs;
00168         for (int32_t i=0; i+k < len; i++) 
00169         {
00170             const uint32_t h=CHash::IncrementalMurmurHash2(vec[i+k], val[i]);
00171             val[i]=h;
00172 
00173 #ifdef DEBUG_HASHEDWD
00174             SG_PRINT("offs=%d o=%d h=%d \n", offs, o, h);
00175             SG_PRINT("vec[i]=%d, k=%d, offs=%d o=%d h=%d \n", vec[i], k,offs, o, h);
00176 #endif
00177             vec2[o+(h & mask)]+=wd;
00178             o+=partial_w_dim;
00179         }
00180         offs+=partial_w_dim*len;
00181     }
00182 
00183     delete[] val;
00184     strings->free_feature_vector(vec, vec_idx1, free_vec1);
00185 }
00186 
00187 void CHashedWDFeatures::set_wd_weights()
00188 {
00189     ASSERT(degree>0);
00190 
00191     mask=(uint32_t) (((uint64_t) 1)<<m_hash_bits)-1;
00192     partial_w_dim=1<<m_hash_bits;
00193     w_dim=partial_w_dim*string_length*(degree-start_degree);
00194 
00195     wd_weights=new float64_t[degree];
00196 
00197     for (int32_t i=0; i<degree; i++)
00198         wd_weights[i]=sqrt(2.0*(from_degree-i)/(from_degree*(from_degree+1)));
00199 
00200     SG_DEBUG("created HashedWDFeatures with d=%d (%d), alphabetsize=%d, "
00201             "dim=%d partial_dim=%d num=%d, len=%d\n", 
00202             degree, from_degree, alphabet_size, 
00203             w_dim, partial_w_dim, num_strings, string_length);
00204 }
00205 
00206 
00207 void CHashedWDFeatures::set_normalization_const(float64_t n)
00208 {
00209     if (n==0)
00210     {
00211         normalization_const=0;
00212         for (int32_t i=0; i<degree; i++)
00213             normalization_const+=(string_length-i)*wd_weights[i]*wd_weights[i];
00214 
00215         normalization_const=CMath::sqrt(normalization_const);
00216     }
00217     else
00218         normalization_const=n;
00219 
00220     SG_DEBUG("normalization_const:%f\n", normalization_const);
00221 }
00222 
00223 CFeatures* CHashedWDFeatures::duplicate() const
00224 {
00225     return new CHashedWDFeatures(*this);
00226 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation