ImplicitWeightedSpecFeatures.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2009 Soeren Sonnenburg
00008  * Copyright (C) 2009 Fraunhofer Institute FIRST and Max-Planck-Society
00009  */
00010 
00011 #include "features/ImplicitWeightedSpecFeatures.h"
00012 #include "lib/io.h"
00013 
00014 using namespace shogun;
00015 
00016 CImplicitWeightedSpecFeatures::CImplicitWeightedSpecFeatures(CStringFeatures<uint16_t>* str, bool normalize) : CDotFeatures()
00017 {
00018     ASSERT(str);
00019     strings=str;
00020     SG_REF(strings)
00021     normalization_factors=NULL;
00022     spec_weights=NULL;
00023     num_strings = str->get_num_vectors();
00024     alphabet_size = str->get_original_num_symbols();
00025     degree=str->get_order();
00026     set_wd_weights();
00027 
00028     SG_DEBUG("WEIGHTED SPEC alphasz=%d, size=%d, num_str=%d\n", alphabet_size,
00029             spec_size, num_strings);
00030 
00031     if (normalize)
00032         compute_normalization_const();
00033 }
00034 
00035 void CImplicitWeightedSpecFeatures::compute_normalization_const()
00036 {
00037     float64_t* factors=new float64_t[num_strings];
00038 
00039     for (int32_t i=0; i<num_strings; i++)
00040         factors[i]=1.0/CMath::sqrt(dot(i,i));
00041 
00042     normalization_factors=factors;
00043     //CMath::display_vector(normalization_factors, num_strings, "n");
00044 }
00045 
00046 bool CImplicitWeightedSpecFeatures::set_wd_weights()
00047 {
00048     delete[] spec_weights;
00049     spec_weights=new float64_t[degree];
00050 
00051     int32_t i;
00052     float64_t sum=0;
00053     spec_size=0;
00054 
00055     for (i=0; i<degree; i++)
00056     {
00057         spec_size+=CMath::pow(alphabet_size, i+1);
00058         spec_weights[i]=degree-i;
00059         sum+=spec_weights[i];
00060     }
00061     for (i=0; i<degree; i++)
00062         spec_weights[i]=CMath::sqrt(spec_weights[i]/sum);
00063 
00064     return spec_weights!=NULL;
00065 }
00066 
00067 bool CImplicitWeightedSpecFeatures::set_weights(float64_t* w, int32_t d)
00068 {
00069     ASSERT(d==degree);
00070 
00071     delete[] spec_weights;
00072     spec_weights=new float64_t[degree];
00073     for (int32_t i=0; i<degree; i++)
00074         spec_weights[i]=CMath::sqrt(w[i]);
00075     return true;
00076 }
00077 
00078 CImplicitWeightedSpecFeatures::CImplicitWeightedSpecFeatures(const CImplicitWeightedSpecFeatures& orig) : CDotFeatures(orig), 
00079     num_strings(orig.num_strings), 
00080     alphabet_size(orig.alphabet_size), spec_size(orig.spec_size)
00081 {
00082     SG_NOTIMPLEMENTED;
00083     SG_REF(strings);
00084 }
00085 
00086 CImplicitWeightedSpecFeatures::~CImplicitWeightedSpecFeatures()
00087 {
00088     SG_UNREF(strings);
00089     delete[] spec_weights;
00090     delete[] normalization_factors;
00091 }
00092 
00093 float64_t CImplicitWeightedSpecFeatures::dot(int32_t vec_idx1, int32_t vec_idx2)
00094 {
00095     ASSERT(vec_idx1 < num_strings);
00096     ASSERT(vec_idx2 < num_strings);
00097 
00098     int32_t len1=-1;
00099     int32_t len2=-1;
00100     bool free_vec1;
00101     bool free_vec2;
00102     uint16_t* vec1=strings->get_feature_vector(vec_idx1, len1, free_vec1);
00103     uint16_t* vec2=strings->get_feature_vector(vec_idx2, len2, free_vec2);
00104 
00105     float64_t result=0;
00106     uint8_t mask=0;
00107 
00108     for (int32_t d=0; d<degree; d++)
00109     {
00110         mask = mask | (1 << (degree-d-1));
00111         uint16_t masked=strings->get_masked_symbols(0xffff, mask);
00112 
00113         int32_t left_idx=0;
00114         int32_t right_idx=0;
00115         float64_t weight=spec_weights[d]*spec_weights[d];
00116 
00117         while (left_idx < len1 && right_idx < len2)
00118         {
00119             uint16_t lsym=vec1[left_idx] & masked;
00120             uint16_t rsym=vec2[right_idx] & masked;
00121 
00122             if (lsym == rsym)
00123             {
00124                 int32_t old_left_idx=left_idx;
00125                 int32_t old_right_idx=right_idx;
00126 
00127                 while (left_idx<len1 && (vec1[left_idx] & masked) ==lsym)
00128                     left_idx++;
00129 
00130                 while (right_idx<len2 && (vec2[right_idx] & masked) ==lsym)
00131                     right_idx++;
00132 
00133                 result+=weight*(left_idx-old_left_idx)*(right_idx-old_right_idx);
00134             }
00135             else if (lsym<rsym)
00136                 left_idx++;
00137             else
00138                 right_idx++;
00139         }
00140     }
00141 
00142     strings->free_feature_vector(vec1, vec_idx1, free_vec1);
00143     strings->free_feature_vector(vec2, vec_idx2, free_vec2);
00144 
00145     if (normalization_factors)
00146         return result*normalization_factors[vec_idx1]*normalization_factors[vec_idx2];
00147     else
00148         return result;
00149 }
00150 
00151 float64_t CImplicitWeightedSpecFeatures::dense_dot(int32_t vec_idx1, const float64_t* vec2, int32_t vec2_len)
00152 {
00153     ASSERT(vec2_len == spec_size);
00154     ASSERT(vec_idx1 < num_strings);
00155 
00156     float64_t result=0;
00157     int32_t len1=-1;
00158     bool free_vec1;
00159     uint16_t* vec1=strings->get_feature_vector(vec_idx1, len1, free_vec1);
00160 
00161     if (vec1 && len1>0)
00162     {
00163         for (int32_t j=0; j<len1; j++)
00164         {
00165             uint8_t mask=0;
00166             int32_t offs=0;
00167             uint16_t v=*vec1++;
00168 
00169             for (int32_t d=0; d<degree; d++)
00170             {
00171                 mask = mask | (1 << (degree-d-1));
00172                 int32_t idx=strings->get_masked_symbols(v, mask);
00173                 idx=strings->shift_symbol(idx, degree-d-1);
00174                 result += vec2[offs + idx]*spec_weights[d];
00175                 offs+=strings->shift_offset(1,d+1);
00176             }
00177         }
00178 
00179         strings->free_feature_vector(vec1, vec_idx1, free_vec1);
00180 
00181         if (normalization_factors)
00182             result*=normalization_factors[vec_idx1];
00183     }
00184     else
00185         SG_ERROR("huh?\n");
00186 
00187     return result;
00188 }
00189 
00190 void CImplicitWeightedSpecFeatures::add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t* vec2, int32_t vec2_len, bool abs_val)
00191 {
00192     int32_t len1=-1;
00193     bool free_vec1;
00194     uint16_t* vec=strings->get_feature_vector(vec_idx1, len1, free_vec1);
00195 
00196     if (normalization_factors)
00197         alpha*=normalization_factors[vec_idx1];
00198 
00199     if (vec && len1>0)
00200     {
00201         for (int32_t j=0; j<len1; j++)
00202         {
00203             uint8_t mask=0;
00204             int32_t offs=0;
00205             for (int32_t d=0; d<degree; d++)
00206             {
00207                 mask = mask | (1 << (degree-d-1));
00208                 int32_t idx=strings->get_masked_symbols(vec[j], mask);
00209                 idx=strings->shift_symbol(idx, degree-d-1);
00210                 if (abs_val)
00211                     vec2[offs + idx] += CMath::abs(alpha*spec_weights[d]);
00212                 else
00213                     vec2[offs + idx] += alpha*spec_weights[d];
00214                 offs+=strings->shift_offset(1,d+1);
00215             }
00216         }
00217     }
00218 
00219     strings->free_feature_vector(vec, vec_idx1, free_vec1);
00220 }
00221 
00222 CFeatures* CImplicitWeightedSpecFeatures::duplicate() const
00223 {
00224     return new CImplicitWeightedSpecFeatures(*this);
00225 }
00226 
00227 void* CImplicitWeightedSpecFeatures::get_feature_iterator(int32_t vector_index)
00228 {
00229     if (vector_index>=num_strings)
00230     {
00231         SG_ERROR("Index out of bounds (number of strings %d, you "
00232                 "requested %d)\n", num_strings, vector_index);
00233     }
00234 
00235     wspec_feature_iterator* it=new wspec_feature_iterator[1];
00236     it->vec= strings->get_feature_vector(vector_index, it->vlen, it->vfree);
00237     it->vidx=vector_index;
00238 
00239     it->offs=0;
00240     it->d=0;
00241     it->j=0;
00242     it->mask=0;
00243     it->alpha=normalization_factors[vector_index];
00244 
00245     return it;
00246 }
00247 
00248 bool CImplicitWeightedSpecFeatures::get_next_feature(int32_t& index, float64_t& value, void* iterator)
00249 {
00250     wspec_feature_iterator* it=(wspec_feature_iterator*) iterator;
00251 
00252     if (it->d>=degree)
00253     {
00254         if (it->j < it->vlen-1)
00255         {
00256             it->j++;
00257             it->d=0;
00258             it->mask=0;
00259             it->offs=0;
00260         }
00261         else
00262             return false;
00263     }
00264 
00265     int32_t d=it->d;
00266 
00267     it->mask = it->mask | (1 << (degree-d-1));
00268     int32_t idx=strings->get_masked_symbols(it->vec[it->j], it->mask);
00269     idx=strings->shift_symbol(idx, degree-d-1);
00270     value=it->alpha*spec_weights[d];
00271     index=it->offs + idx;
00272     it->offs+=strings->shift_offset(1,d+1);
00273 
00274     it->d=d+1;
00275     return true;
00276 }
00277 
00278 void CImplicitWeightedSpecFeatures::free_feature_iterator(void* iterator)
00279 {
00280     ASSERT(iterator);
00281     wspec_feature_iterator* it=(wspec_feature_iterator*) iterator;
00282     strings->free_feature_vector(it->vec, it->vidx, it->vfree);
00283     delete[] it;
00284 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation