ImplicitWeightedSpecFeatures.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2009 Soeren Sonnenburg
00008  * Copyright (C) 2009 Fraunhofer Institute FIRST and Max-Planck-Society
00009  */
00010 
00011 #include "features/ImplicitWeightedSpecFeatures.h"
00012 #include "lib/io.h"
00013 
00014 CImplicitWeightedSpecFeatures::CImplicitWeightedSpecFeatures(CStringFeatures<uint16_t>* str, bool normalize) : CDotFeatures()
00015 {
00016     ASSERT(str);
00017     strings=str;
00018     SG_REF(strings)
00019     normalization_factors=NULL;
00020     spec_weights=NULL;
00021     num_strings = str->get_num_vectors();
00022     alphabet_size = str->get_original_num_symbols();
00023     degree=str->get_order();
00024     set_wd_weights();
00025 
00026     SG_DEBUG("WEIGHTED SPEC alphasz=%d, size=%d, num_str=%d\n", alphabet_size,
00027             spec_size, num_strings);
00028 
00029     if (normalize)
00030         compute_normalization_const();
00031 }
00032 
00033 void CImplicitWeightedSpecFeatures::compute_normalization_const()
00034 {
00035     float64_t* factors=new float64_t[num_strings];
00036 
00037     for (int32_t i=0; i<num_strings; i++)
00038         factors[i]=1.0/CMath::sqrt(dot(i,i));
00039 
00040     normalization_factors=factors;
00041     //CMath::display_vector(normalization_factors, num_strings, "n");
00042 }
00043 
00044 bool CImplicitWeightedSpecFeatures::set_wd_weights()
00045 {
00046     delete[] spec_weights;
00047     spec_weights=new float64_t[degree];
00048 
00049     int32_t i;
00050     float64_t sum=0;
00051     spec_size=0;
00052 
00053     for (i=0; i<degree; i++)
00054     {
00055         spec_size+=CMath::pow(alphabet_size, i+1);
00056         spec_weights[i]=degree-i;
00057         sum+=spec_weights[i];
00058     }
00059     for (i=0; i<degree; i++)
00060         spec_weights[i]=CMath::sqrt(spec_weights[i]/sum);
00061 
00062     return spec_weights!=NULL;
00063 }
00064 
00065 bool CImplicitWeightedSpecFeatures::set_weights(float64_t* w, int32_t d)
00066 {
00067     ASSERT(d==degree);
00068 
00069     delete[] spec_weights;
00070     spec_weights=new float64_t[degree];
00071     for (int32_t i=0; i<degree; i++)
00072         spec_weights[i]=CMath::sqrt(w[i]);
00073     return true;
00074 }
00075 
00076 CImplicitWeightedSpecFeatures::CImplicitWeightedSpecFeatures(const CImplicitWeightedSpecFeatures& orig) : CDotFeatures(orig), 
00077     num_strings(orig.num_strings), 
00078     alphabet_size(orig.alphabet_size), spec_size(orig.spec_size)
00079 {
00080     SG_NOTIMPLEMENTED;
00081     SG_REF(strings);
00082 }
00083 
00084 CImplicitWeightedSpecFeatures::~CImplicitWeightedSpecFeatures()
00085 {
00086     SG_UNREF(strings);
00087     delete[] spec_weights;
00088     delete[] normalization_factors;
00089 }
00090 
00091 float64_t CImplicitWeightedSpecFeatures::dot(int32_t vec_idx1, int32_t vec_idx2)
00092 {
00093     ASSERT(vec_idx1 < num_strings);
00094     ASSERT(vec_idx2 < num_strings);
00095 
00096     int32_t len1=-1;
00097     int32_t len2=-1;
00098     uint16_t* vec1=strings->get_feature_vector(vec_idx1, len1);
00099     uint16_t* vec2=strings->get_feature_vector(vec_idx2, len2);
00100 
00101     float64_t result=0;
00102     uint8_t mask=0;
00103 
00104     for (int32_t d=0; d<degree; d++)
00105     {
00106         mask = mask | (1 << (degree-d-1));
00107         uint16_t masked=strings->get_masked_symbols(0xffff, mask);
00108 
00109         int32_t left_idx=0;
00110         int32_t right_idx=0;
00111         float64_t weight=spec_weights[d]*spec_weights[d];
00112 
00113         while (left_idx < len1 && right_idx < len2)
00114         {
00115             uint16_t lsym=vec1[left_idx] & masked;
00116             uint16_t rsym=vec2[right_idx] & masked;
00117 
00118             if (lsym == rsym)
00119             {
00120                 int32_t old_left_idx=left_idx;
00121                 int32_t old_right_idx=right_idx;
00122 
00123                 while (left_idx<len1 && (vec1[left_idx] & masked) ==lsym)
00124                     left_idx++;
00125 
00126                 while (right_idx<len2 && (vec2[right_idx] & masked) ==lsym)
00127                     right_idx++;
00128 
00129                 result+=weight*(left_idx-old_left_idx)*(right_idx-old_right_idx);
00130             }
00131             else if (lsym<rsym)
00132                 left_idx++;
00133             else
00134                 right_idx++;
00135         }
00136     }
00137 
00138     if (normalization_factors)
00139         return result*normalization_factors[vec_idx1]*normalization_factors[vec_idx2];
00140     else
00141         return result;
00142 }
00143 
00144 float64_t CImplicitWeightedSpecFeatures::dense_dot(int32_t vec_idx1, const float64_t* vec2, int32_t vec2_len)
00145 {
00146     ASSERT(vec2_len == spec_size);
00147     ASSERT(vec_idx1 < num_strings);
00148 
00149     float64_t result=0;
00150     int32_t len1=-1;
00151     uint16_t* vec1=strings->get_feature_vector(vec_idx1, len1);
00152 
00153     if (vec1 && len1>0)
00154     {
00155         for (int32_t j=0; j<len1; j++)
00156         {
00157             uint8_t mask=0;
00158             int32_t offs=0;
00159             uint16_t v=*vec1++;
00160 
00161             for (int32_t d=0; d<degree; d++)
00162             {
00163                 mask = mask | (1 << (degree-d-1));
00164                 int32_t idx=strings->get_masked_symbols(v, mask);
00165                 idx=strings->shift_symbol(idx, degree-d-1);
00166                 result += vec2[offs + idx]*spec_weights[d];
00167                 offs+=strings->shift_offset(1,d+1);
00168             }
00169         }
00170 
00171         if (normalization_factors)
00172             result*=normalization_factors[vec_idx1];
00173     }
00174     else
00175         SG_ERROR("huh?\n");
00176 
00177     return result;
00178 }
00179 
00180 void CImplicitWeightedSpecFeatures::add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t* vec2, int32_t vec2_len, bool abs_val)
00181 {
00182     int32_t len1=-1;
00183     uint16_t* vec=strings->get_feature_vector(vec_idx1, len1);
00184 
00185     if (normalization_factors)
00186         alpha*=normalization_factors[vec_idx1];
00187 
00188     if (vec && len1>0)
00189     {
00190         for (int32_t j=0; j<len1; j++)
00191         {
00192             uint8_t mask=0;
00193             int32_t offs=0;
00194             for (int32_t d=0; d<degree; d++)
00195             {
00196                 mask = mask | (1 << (degree-d-1));
00197                 int32_t idx=strings->get_masked_symbols(vec[j], mask);
00198                 idx=strings->shift_symbol(idx, degree-d-1);
00199                 if (abs_val)
00200                     vec2[offs + idx] += CMath::abs(alpha*spec_weights[d]);
00201                 else
00202                     vec2[offs + idx] += alpha*spec_weights[d];
00203                 offs+=strings->shift_offset(1,d+1);
00204             }
00205         }
00206     }
00207 }
00208 
00209 CFeatures* CImplicitWeightedSpecFeatures::duplicate() const
00210 {
00211     return new CImplicitWeightedSpecFeatures(*this);
00212 }

SHOGUN Machine Learning Toolbox - Documentation