SalzbergWordStringKernel.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2009 Gunnar Raetsch
00008  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
00009  */
00010 
00011 #include "lib/common.h"
00012 #include "lib/io.h"
00013 #include "kernel/SalzbergWordStringKernel.h"
00014 #include "features/Features.h"
00015 #include "features/StringFeatures.h"
00016 #include "features/Labels.h"
00017 #include "classifier/PluginEstimate.h"
00018 
00019 using namespace shogun;
00020 
00021 CSalzbergWordStringKernel::CSalzbergWordStringKernel(int32_t size, CPluginEstimate* pie, CLabels* labels)
00022 : CStringKernel<uint16_t>(size), estimate(pie), mean(NULL), variance(NULL),
00023     sqrtdiag_lhs(NULL), sqrtdiag_rhs(NULL),
00024     ld_mean_lhs(NULL), ld_mean_rhs(NULL),
00025     num_params(0), num_symbols(0), sum_m2_s2(0), pos_prior(0.5),
00026     neg_prior(0.5), initialized(false)
00027 {
00028     if (labels)
00029         set_prior_probs_from_labels(labels);
00030 }
00031 
00032 CSalzbergWordStringKernel::CSalzbergWordStringKernel(
00033     CStringFeatures<uint16_t>* l, CStringFeatures<uint16_t>* r,
00034     CPluginEstimate* pie, CLabels* labels)
00035 : CStringKernel<uint16_t>(10),estimate(pie), mean(NULL), variance(NULL),
00036     sqrtdiag_lhs(NULL), sqrtdiag_rhs(NULL),
00037     ld_mean_lhs(NULL), ld_mean_rhs(NULL),
00038     num_params(0), num_symbols(0), sum_m2_s2(0), pos_prior(0.5),
00039     neg_prior(0.5), initialized(false)
00040 {
00041     if (labels)
00042         set_prior_probs_from_labels(labels);
00043 
00044     init(l, r);
00045 }
00046 
00047 CSalzbergWordStringKernel::~CSalzbergWordStringKernel()
00048 {
00049     cleanup();
00050 }
00051 
00052 bool CSalzbergWordStringKernel::init(CFeatures* p_l, CFeatures* p_r)
00053 {
00054     CStringKernel<uint16_t>::init(p_l,p_r);
00055     CStringFeatures<uint16_t>* l=(CStringFeatures<uint16_t>*) p_l;
00056     ASSERT(l);
00057     CStringFeatures<uint16_t>* r=(CStringFeatures<uint16_t>*) p_r;
00058     ASSERT(r);
00059 
00060     int32_t i;
00061     initialized=false;
00062 
00063     if (sqrtdiag_lhs!=sqrtdiag_rhs)
00064         delete[] sqrtdiag_rhs;
00065     sqrtdiag_rhs=NULL;
00066     delete[] sqrtdiag_lhs;
00067     sqrtdiag_lhs=NULL;
00068     if (ld_mean_lhs!=ld_mean_rhs)
00069         delete[] ld_mean_rhs;
00070     ld_mean_rhs=NULL;
00071     delete[] ld_mean_lhs;
00072     ld_mean_lhs=NULL;
00073 
00074     sqrtdiag_lhs=new float64_t[l->get_num_vectors()];
00075     ld_mean_lhs=new float64_t[l->get_num_vectors()];
00076 
00077     for (i=0; i<l->get_num_vectors(); i++)
00078         sqrtdiag_lhs[i]=1;
00079 
00080     if (l==r)
00081     {
00082         sqrtdiag_rhs=sqrtdiag_lhs;
00083         ld_mean_rhs=ld_mean_lhs;
00084     }
00085     else
00086     {
00087         sqrtdiag_rhs=new float64_t[r->get_num_vectors()];
00088         for (i=0; i<r->get_num_vectors(); i++)
00089             sqrtdiag_rhs[i]=1;
00090 
00091         ld_mean_rhs=new float64_t[r->get_num_vectors()];
00092     }
00093 
00094     float64_t* l_ld_mean_lhs=ld_mean_lhs;
00095     float64_t* l_ld_mean_rhs=ld_mean_rhs;
00096 
00097     //from our knowledge first normalize variance to 1 and then norm=1 does the job
00098     if (!initialized)
00099     {
00100         int32_t num_vectors=l->get_num_vectors();
00101         num_symbols=(int32_t) l->get_num_symbols();
00102         int32_t llen=l->get_vector_length(0);
00103         int32_t rlen=r->get_vector_length(0);
00104         num_params=(int32_t) llen*l->get_num_symbols();
00105         int32_t num_params2=(int32_t) llen*l->get_num_symbols()+rlen*r->get_num_symbols();
00106         if ((!estimate) || (!estimate->check_models()))
00107         {
00108             SG_ERROR( "no estimate available\n");
00109             return false ;
00110         } ;
00111         if (num_params2!=estimate->get_num_params())
00112         {
00113             SG_ERROR( "number of parameters of estimate and feature representation do not match\n");
00114             return false ;
00115         } ;
00116 
00117         delete[] variance;
00118         delete[] mean;
00119         mean=new float64_t[num_params];
00120         ASSERT(mean);
00121         variance=new float64_t[num_params];
00122         ASSERT(variance);
00123 
00124         for (i=0; i<num_params; i++)
00125         {
00126             mean[i]=0;
00127             variance[i]=0;
00128         }
00129 
00130 
00131         // compute mean
00132         for (i=0; i<num_vectors; i++)
00133         {
00134             int32_t len;
00135             bool free_vec;
00136             uint16_t* vec=l->get_feature_vector(i, len, free_vec);
00137 
00138             for (int32_t j=0; j<len; j++)
00139             {
00140                 int32_t idx=compute_index(j, vec[j]);
00141                 float64_t theta_p = 1/estimate->log_derivative_pos_obsolete(vec[j], j) ;
00142                 float64_t theta_n = 1/estimate->log_derivative_neg_obsolete(vec[j], j) ;
00143                 float64_t value   = (theta_p/(pos_prior*theta_p+neg_prior*theta_n)) ;
00144 
00145                 mean[idx]   += value/num_vectors ;
00146             }
00147             l->free_feature_vector(vec, i, free_vec);
00148         }
00149 
00150         // compute variance
00151         for (i=0; i<num_vectors; i++)
00152         {
00153             int32_t len;
00154             bool free_vec;
00155             uint16_t* vec=l->get_feature_vector(i, len, free_vec);
00156 
00157             for (int32_t j=0; j<len; j++)
00158             {
00159                 for (int32_t k=0; k<4; k++)
00160                 {
00161                     int32_t idx=compute_index(j, k);
00162                     if (k!=vec[j])
00163                         variance[idx]+=mean[idx]*mean[idx]/num_vectors;
00164                     else
00165                     {
00166                         float64_t theta_p = 1/estimate->log_derivative_pos_obsolete(vec[j], j) ;
00167                         float64_t theta_n = 1/estimate->log_derivative_neg_obsolete(vec[j], j) ;
00168                         float64_t value = (theta_p/(pos_prior*theta_p+neg_prior*theta_n)) ;
00169 
00170                         variance[idx] += CMath::sq(value-mean[idx])/num_vectors;
00171                     }
00172                 }
00173             }
00174             l->free_feature_vector(vec, i, free_vec);
00175         }
00176 
00177 
00178         // compute sum_i m_i^2/s_i^2
00179         sum_m2_s2=0 ;
00180         for (i=0; i<num_params; i++)
00181         {
00182             if (variance[i]<1e-14) // then it is likely to be numerical inaccuracy
00183                 variance[i]=1 ;
00184 
00185             //fprintf(stderr, "%i: mean=%1.2e  std=%1.2e\n", i, mean[i], std[i]) ;
00186             sum_m2_s2 += mean[i]*mean[i]/(variance[i]) ;
00187         } ;
00188     } 
00189 
00190     // compute sum of 
00191     //result -= feature*mean[a_idx]/variance[a_idx] ;
00192 
00193     for (i=0; i<l->get_num_vectors(); i++)
00194     {
00195         int32_t alen ;
00196         bool free_avec;
00197         uint16_t* avec=l->get_feature_vector(i, alen, free_avec);
00198         float64_t  result=0 ;
00199         for (int32_t j=0; j<alen; j++)
00200         {
00201             int32_t a_idx = compute_index(j, avec[j]) ;
00202             float64_t theta_p = 1/estimate->log_derivative_pos_obsolete(avec[j], j) ;
00203             float64_t theta_n = 1/estimate->log_derivative_neg_obsolete(avec[j], j) ;
00204             float64_t value = (theta_p/(pos_prior*theta_p+neg_prior*theta_n)) ;
00205 
00206             if (variance[a_idx]!=0)
00207                 result-=value*mean[a_idx]/variance[a_idx];
00208         }
00209         ld_mean_lhs[i]=result ;
00210 
00211         l->free_feature_vector(avec, i, free_avec);
00212     }
00213 
00214     if (ld_mean_lhs!=ld_mean_rhs)
00215     {
00216         // compute sum of 
00217         //result -= feature*mean[b_idx]/variance[b_idx] ;
00218         for (i=0; i<r->get_num_vectors(); i++)
00219         {
00220             int32_t alen;
00221             bool free_avec;
00222             uint16_t* avec=r->get_feature_vector(i, alen, free_avec);
00223             float64_t  result=0;
00224 
00225             for (int32_t j=0; j<alen; j++)
00226             {
00227                 int32_t a_idx = compute_index(j, avec[j]) ;
00228                 float64_t theta_p=1/estimate->log_derivative_pos_obsolete(
00229                     avec[j], j) ;
00230                 float64_t theta_n=1/estimate->log_derivative_neg_obsolete(
00231                     avec[j], j) ;
00232                 float64_t value=(theta_p/(pos_prior*theta_p+neg_prior*theta_n));
00233 
00234                 result -= value*mean[a_idx]/variance[a_idx] ;
00235             }
00236 
00237             ld_mean_rhs[i]=result;
00238             l->free_feature_vector(avec, i, free_avec);
00239         }
00240     }
00241 
00242     //warning hacky
00243     //
00244     this->lhs=l;
00245     this->rhs=l;
00246     ld_mean_lhs = l_ld_mean_lhs ;
00247     ld_mean_rhs = l_ld_mean_lhs ;
00248 
00249     //compute normalize to 1 values
00250     for (i=0; i<lhs->get_num_vectors(); i++)
00251     {
00252         sqrtdiag_lhs[i]=sqrt(compute(i,i));
00253 
00254         //trap divide by zero exception
00255         if (sqrtdiag_lhs[i]==0)
00256             sqrtdiag_lhs[i]=1e-16;
00257     }
00258 
00259     // if lhs is different from rhs (train/test data)
00260     // compute also the normalization for rhs
00261     if (sqrtdiag_lhs!=sqrtdiag_rhs)
00262     {
00263         this->lhs=r;
00264         this->rhs=r;
00265         ld_mean_lhs = l_ld_mean_rhs ;
00266         ld_mean_rhs = l_ld_mean_rhs ;
00267 
00268         //compute normalize to 1 values
00269         for (i=0; i<rhs->get_num_vectors(); i++)
00270         {
00271             sqrtdiag_rhs[i]=sqrt(compute(i,i));
00272 
00273             //trap divide by zero exception
00274             if (sqrtdiag_rhs[i]==0)
00275                 sqrtdiag_rhs[i]=1e-16;
00276         }
00277     }
00278 
00279     this->lhs=l;
00280     this->rhs=r;
00281     ld_mean_lhs = l_ld_mean_lhs ;
00282     ld_mean_rhs = l_ld_mean_rhs ;
00283 
00284     initialized = true ;
00285     return init_normalizer();
00286 }
00287 
00288 void CSalzbergWordStringKernel::cleanup()
00289 {
00290     delete[] variance;
00291     variance=NULL;
00292 
00293     delete[] mean;
00294     mean=NULL;
00295 
00296     if (sqrtdiag_lhs != sqrtdiag_rhs)
00297         delete[] sqrtdiag_rhs;
00298     sqrtdiag_rhs=NULL;
00299 
00300     delete[] sqrtdiag_lhs;
00301     sqrtdiag_lhs=NULL;
00302 
00303     if (ld_mean_lhs!=ld_mean_rhs)
00304         delete[] ld_mean_rhs ;
00305     ld_mean_rhs=NULL;
00306 
00307     delete[] ld_mean_lhs ;
00308     ld_mean_lhs=NULL;
00309 
00310     CKernel::cleanup();
00311 }
00312 
00313 float64_t CSalzbergWordStringKernel::compute(int32_t idx_a, int32_t idx_b)
00314 {
00315     int32_t alen, blen;
00316     bool free_avec, free_bvec;
00317     uint16_t* avec=((CStringFeatures<uint16_t>*) lhs)->get_feature_vector(idx_a, alen, free_avec);
00318     uint16_t* bvec=((CStringFeatures<uint16_t>*) rhs)->get_feature_vector(idx_b, blen, free_bvec);
00319     // can only deal with strings of same length
00320     ASSERT(alen==blen);
00321 
00322     float64_t result = sum_m2_s2 ; // does not contain 0-th element
00323 
00324     for (int32_t i=0; i<alen; i++)
00325     {
00326         if (avec[i]==bvec[i])
00327         {
00328             int32_t a_idx = compute_index(i, avec[i]) ;
00329 
00330             float64_t theta_p = 1/estimate->log_derivative_pos_obsolete(avec[i], i) ;
00331             float64_t theta_n = 1/estimate->log_derivative_neg_obsolete(avec[i], i) ;
00332             float64_t value = (theta_p/(pos_prior*theta_p+neg_prior*theta_n)) ;
00333 
00334             result   += value*value/variance[a_idx] ;
00335         }
00336     }
00337     result += ld_mean_lhs[idx_a] + ld_mean_rhs[idx_b] ;
00338 
00339     ((CStringFeatures<uint16_t>*) lhs)->free_feature_vector(avec, idx_a, free_avec);
00340     ((CStringFeatures<uint16_t>*) rhs)->free_feature_vector(bvec, idx_b, free_bvec);
00341 
00342     if (initialized)
00343         result /=  (sqrtdiag_lhs[idx_a]*sqrtdiag_rhs[idx_b]) ;
00344 
00345     return result;
00346 }
00347 
00348 void CSalzbergWordStringKernel::set_prior_probs_from_labels(CLabels* labels)
00349 {
00350     ASSERT(labels);
00351 
00352     int32_t num_pos=0, num_neg=0;
00353     for (int32_t i=0; i<labels->get_num_labels(); i++)
00354     {
00355         if (labels->get_int_label(i)==1)
00356             num_pos++;
00357         if (labels->get_int_label(i)==-1)
00358             num_neg++;
00359     }
00360 
00361     SG_INFO("priors: pos=%1.3f (%i)  neg=%1.3f (%i)\n",
00362         (float64_t) num_pos/(num_pos+num_neg), num_pos,
00363         (float64_t) num_neg/(num_pos+num_neg), num_neg);
00364 
00365     set_prior_probs(
00366         (float64_t)num_pos/(num_pos+num_neg),
00367         (float64_t)num_neg/(num_pos+num_neg));
00368 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation