SparseFeatures.h

00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2010 Soeren Sonnenburg
00008  * Written (W) 1999-2008 Gunnar Raetsch
00009  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
00010  * Copyright (C) 2010 Berlin Institute of Technology
00011  */
00012 
00013 #ifndef _SPARSEFEATURES__H__
00014 #define _SPARSEFEATURES__H__
00015 
00016 #include <string.h>
00017 #include <stdlib.h>
00018 
00019 #include "lib/common.h"
00020 #include "lib/Mathematics.h"
00021 #include "lib/Cache.h"
00022 #include "lib/io.h"
00024 #include "lib/File.h"
00025 
00026 #include "features/Labels.h"
00027 #include "features/Features.h"
00028 #include "features/DotFeatures.h"
00029 #include "features/SimpleFeatures.h"
00030 #include "preproc/SparsePreProc.h"
00031 
00032 namespace shogun
00033 {
00034 
00035 class CFile;
00036 class CLabels;
00037 class CFeatures;
00038 class CDotFeatures;
00039 template <class ST> class CSimpleFeatures;
00040 template <class ST> class CSparsePreProc;
00041 
00042 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00043 
00044 template <class ST> struct TSparseEntry
00045 {
00047     int32_t feat_index;
00049     ST entry;
00050 };
00051 
00053 template <class ST> struct TSparse
00054 {
00055     public:
00057         int32_t vec_index;
00059         int32_t num_feat_entries;
00061         TSparseEntry<ST>* features;
00062 };
00063 #endif // DOXYGEN_SHOULD_SKIP_THIS
00064 
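/* A TSparse<ST> row stores one example as a list of (feat_index, entry)
 * pairs in its features array, together with the number of stored pairs
 * (num_feat_entries) and the position of the example (vec_index). For
 * instance, the dense vector [0, 3, 0, 7] is held as the two entries
 * {feat_index=1, entry=3} and {feat_index=3, entry=7}. Several routines
 * below (sparse_dot, compute_squared_norm) assume that the entries of a
 * vector are sorted by feat_index in ascending order; sort_features()
 * establishes this order. */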
00077 template <class ST> class CSparseFeatures : public CDotFeatures
00078 {
00079     public:
00084         CSparseFeatures(int32_t size=0)
00085         : CDotFeatures(size), num_vectors(0), num_features(0),
00086             sparse_feature_matrix(NULL), feature_cache(NULL)
00087         {}
00088 
00097         CSparseFeatures(TSparse<ST>* src, int32_t num_feat, int32_t num_vec, bool copy=false)
00098         : CDotFeatures(0), num_vectors(0), num_features(0),
00099             sparse_feature_matrix(NULL), feature_cache(NULL)
00100         {
00101             if (!copy)
00102                 set_sparse_feature_matrix(src, num_feat, num_vec);
00103             else
00104             {
00105                 sparse_feature_matrix = new TSparse<ST>[num_vec];
00106                 memcpy(sparse_feature_matrix, src, sizeof(TSparse<ST>)*num_vec);
00107                 for (int32_t i=0; i< num_vec; i++)
00108                 {
00109                     sparse_feature_matrix[i].features = new TSparseEntry<ST>[sparse_feature_matrix[i].num_feat_entries];
00110                     memcpy(sparse_feature_matrix[i].features, src[i].features, sizeof(TSparseEntry<ST>)*sparse_feature_matrix[i].num_feat_entries);
00111 
00112                 }
00113             }
00114         }
00115 
00123         CSparseFeatures(ST* src, int32_t num_feat, int32_t num_vec)
00124         : CDotFeatures(0), num_vectors(0), num_features(0),
00125             sparse_feature_matrix(NULL), feature_cache(NULL)
00126         {
00127             set_full_feature_matrix(src, num_feat, num_vec);
00128         }
00129 
00131         CSparseFeatures(const CSparseFeatures & orig)
00132         : CDotFeatures(orig), num_vectors(orig.num_vectors),
00133             num_features(orig.num_features),
00135             sparse_feature_matrix(NULL),
00135             feature_cache(orig.feature_cache)
00136         {
00137             if (orig.sparse_feature_matrix)
00138             {
00140                 sparse_feature_matrix=new TSparse<ST>[num_vectors];
00141                 memcpy(sparse_feature_matrix, orig.sparse_feature_matrix, sizeof(TSparse<ST>)*num_vectors);
00142                 for (int32_t i=0; i< num_vectors; i++)
00143                 {
00144                     sparse_feature_matrix[i].features=new TSparseEntry<ST>[sparse_feature_matrix[i].num_feat_entries];
00145                     memcpy(sparse_feature_matrix[i].features, orig.sparse_feature_matrix[i].features, sizeof(TSparseEntry<ST>)*sparse_feature_matrix[i].num_feat_entries);
00146 
00147                 }
00148             }
00149         }
00150 
00155         CSparseFeatures(CFile* loader)
00156         : CDotFeatures(loader), num_vectors(0), num_features(0),
00157             sparse_feature_matrix(NULL), feature_cache(NULL)
00158         {
00159             load(loader);
00160         }
00161 
00163         virtual ~CSparseFeatures()
00164         {
00165             free_sparse_features();
00166         }
00167 
00171         void free_sparse_feature_matrix()
00172         {
00173             clean_tsparse(sparse_feature_matrix, num_vectors);
00174             sparse_feature_matrix = NULL;
00175             num_vectors=0;
00176             num_features=0;
00177         }
00178 
00182         void free_sparse_features()
00183         {
00184             free_sparse_feature_matrix();
00185             delete feature_cache;
00186             feature_cache = NULL;
00187         }
00188 
00193         virtual CFeatures* duplicate() const
00194         {
00195             return new CSparseFeatures<ST>(*this);
00196         }
00197 
00205         ST get_feature(int32_t num, int32_t index)
00206         {
00207             ASSERT(index>=0 && index<num_features) ;
00208             ASSERT(num>=0 && num<num_vectors) ;
00209 
00210             bool vfree;
00211             int32_t num_feat;
00212             int32_t i;
00213             TSparseEntry<ST>* sv=get_sparse_feature_vector(num, num_feat, vfree);
00214             ST ret = 0 ;
00215             
00216             if (sv)
00217             {
00218                 for (i=0; i<num_feat; i++)
00219                     if (sv[i].feat_index==index)
00220                         ret += sv[i].entry ;
00221             }
00222             
00223             free_sparse_feature_vector(sv, num, vfree);
00224             
00225             return ret ;
00226         }
00227         
00228 
00237         ST* get_full_feature_vector(int32_t num, int32_t& len)
00238         {
00239             bool vfree;
00240             int32_t num_feat;
00241             int32_t i;
00242             len=0;
00243             TSparseEntry<ST>* sv=get_sparse_feature_vector(num, num_feat, vfree);
00244             ST* fv=NULL;
00245 
00246             if (sv)
00247             {
00248                 len=num_features;
00249                 fv=new ST[num_features];
00250 
00251                 for (i=0; i<num_features; i++)
00252                     fv[i]=0;
00253 
00254                 for (i=0; i<num_feat; i++)
00255                     fv[sv[i].feat_index]= sv[i].entry;
00256             }
00257 
00258             free_sparse_feature_vector(sv, num, vfree);
00259 
00260             return fv;
00261         }
00262 
00269         void get_full_feature_vector(ST** dst, int32_t* len, int32_t num)
00270         {
00271             if (num>=num_vectors)
00272             {
00273                 SG_ERROR("Index out of bounds (number of vectors %d, you "
00274                         "requested %d)\n", num_vectors, num);
00275             }
00276 
00277             bool vfree;
00278             int32_t num_feat=0;
00279             *len=0;
00280             TSparseEntry<ST>* sv=get_sparse_feature_vector(num, num_feat, vfree);
00281 
00282             if (sv)
00283             {
00284                 *len=num_features;
00285                 *dst= (ST*) malloc(sizeof(ST)*num_features);
00286                 memset(*dst, 0, sizeof(ST)*num_features);
00287 
00288                 for (int32_t i=0; i<num_feat; i++)
00289                     (*dst)[sv[i].feat_index]= sv[i].entry;
00290             }
00291 
00292             free_sparse_feature_vector(sv, num, vfree);
00293         }
00294 
00295 
00301         virtual inline int32_t get_nnz_features_for_vector(int32_t num)
00302         {
00303             bool vfree;
00304             int32_t len;
00305             TSparseEntry<ST>* sv = get_sparse_feature_vector(num, len, vfree);
00306             free_sparse_feature_vector(sv, num, vfree);
00307             return len;
00308         }
00309 
00320         TSparseEntry<ST>* get_sparse_feature_vector(int32_t num, int32_t& len, bool& vfree)
00321         {
00322             ASSERT(num<num_vectors);
00323 
00324             if (sparse_feature_matrix)
00325             {
00326                 len= sparse_feature_matrix[num].num_feat_entries;
00327                 vfree=false ;
00328                 return sparse_feature_matrix[num].features;
00329             } 
00330             else
00331             {
00332                 TSparseEntry<ST>* feat=NULL;
00333                 vfree=false;
00334 
00335                 if (feature_cache)
00336                 {
00337                     feat=feature_cache->lock_entry(num);
00338 
00339                     if (feat)
00340                         return feat;
00341                     else
00342                     {
00343                         feat=feature_cache->set_entry(num);
00344                     }
00345                 }
00346 
00347                 if (!feat)
00348                     vfree=true;
00349 
00350                 feat=compute_sparse_feature_vector(num, len, feat);
00351 
00352 
00353                 if (get_num_preproc())
00354                 {
00355                     int32_t tmp_len=len;
00356                     TSparseEntry<ST>* tmp_feat_before = feat;
00357                     TSparseEntry<ST>* tmp_feat_after = NULL;
00358 
00359                     for (int32_t i=0; i<get_num_preproc(); i++)
00360                     {
00361                         //tmp_feat_after=((CSparsePreProc<ST>*) get_preproc(i))->apply_to_feature_vector(tmp_feat_before, tmp_len);
00362 
00363                         if (i!=0)   // delete feature vector, except for the first one, i.e., feat
00364                             delete[] tmp_feat_before;
00365                         tmp_feat_before=tmp_feat_after;
00366                     }
00367 
00368                     memcpy(feat, tmp_feat_after, sizeof(TSparseEntry<ST>)*tmp_len);
00369                     delete[] tmp_feat_after;
00370                     len=tmp_len ;
00371                     SG_DEBUG( "len: %d len2: %d\n", len, num_features);
00372                 }
00373                 return feat ;
00374             }
00375         }
00376 
00377 
00388         static ST sparse_dot(ST alpha, TSparseEntry<ST>* avec, int32_t alen, TSparseEntry<ST>* bvec, int32_t blen)
00389         {
00390             ST result=0;
00391 
00392             //result remains zero when one of the vectors is non-existent
00393             if (avec && bvec)
00394             {
00395                 if (alen<=blen)
00396                 {
00397                     int32_t j=0;
00398                     for (int32_t i=0; i<alen; i++)
00399                     {
00400                         int32_t a_feat_idx=avec[i].feat_index;
00401 
00402                         while ( (j<blen) && (bvec[j].feat_index < a_feat_idx) )
00403                             j++;
00404 
00405                         if ( (j<blen) && (bvec[j].feat_index == a_feat_idx) )
00406                         {
00407                             result+= avec[i].entry * bvec[j].entry;
00408                             j++;
00409                         }
00410                     }
00411                 }
00412                 else
00413                 {
00414                     int32_t j=0;
00415                     for (int32_t i=0; i<blen; i++)
00416                     {
00417                         int32_t b_feat_idx=bvec[i].feat_index;
00418 
00419                         while ( (j<alen) && (avec[j].feat_index < b_feat_idx) )
00420                             j++;
00421 
00422                         if ( (j<alen) && (avec[j].feat_index == b_feat_idx) )
00423                         {
00424                             result+= bvec[i].entry * avec[j].entry;
00425                             j++;
00426                         }
00427                     }
00428                 }
00429 
00430                 result*=alpha;
00431             }
00432 
00433             return result;
00434         }
00435 
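        /* sparse_dot above computes the dot product of two sparse vectors in a
         * single merge pass: it iterates over the shorter entry list and
         * advances an index j over the longer one until the feature indices
         * match or overshoot, giving O(alen+blen) cost. This is only correct
         * when the feat_index values within each vector are sorted in
         * ascending order (see sort_features() and the do_sort_features flag
         * of load_svmlight_file()). */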
00446         ST dense_dot(ST alpha, int32_t num, ST* vec, int32_t dim, ST b)
00447         {
00448             ASSERT(vec);
00449             ASSERT(dim==num_features);
00450             ST result=b;
00451 
00452             bool vfree;
00453             int32_t num_feat;
00454             TSparseEntry<ST>* sv=get_sparse_feature_vector(num, num_feat, vfree);
00455 
00456             if (sv)
00457             {
00458                 for (int32_t i=0; i<num_feat; i++)
00459                     result+=alpha*vec[sv[i].feat_index]*sv[i].entry;
00460             }
00461 
00462             free_sparse_feature_vector(sv, num, vfree);
00463             return result;
00464         }
00465 
00475         void add_to_dense_vec(float64_t alpha, int32_t num, float64_t* vec, int32_t dim, bool abs_val=false)
00476         {
00477             ASSERT(vec);
00478             if (dim!=num_features)
00479             {
00480                 SG_ERROR("dimension of vec (=%d) does not match number of features (=%d)\n",
00481                         dim, num_features);
00482             }
00483 
00484             bool vfree;
00485             int32_t num_feat;
00486             TSparseEntry<ST>* sv=get_sparse_feature_vector(num, num_feat, vfree);
00487 
00488             if (sv)
00489             {
00490                 if (abs_val)
00491                 {
00492                     for (int32_t i=0; i<num_feat; i++)
00493                         vec[sv[i].feat_index]+= alpha*CMath::abs(sv[i].entry);
00494                 }
00495                 else
00496                 {
00497                     for (int32_t i=0; i<num_feat; i++)
00498                         vec[sv[i].feat_index]+= alpha*sv[i].entry;
00499                 }
00500             }
00501 
00502             free_sparse_feature_vector(sv, num, vfree);
00503         }
00504 
00511         void free_sparse_feature_vector(TSparseEntry<ST>* feat_vec, int32_t num, bool free)
00512         {
00513             if (feature_cache)
00514                 feature_cache->unlock_entry(num);
00515 
00516             if (free)
00517                 delete[] feat_vec ;
00518         } 
00519 
00527         TSparse<ST>* get_sparse_feature_matrix(int32_t &num_feat, int32_t &num_vec)
00528         {
00529             num_feat=num_features;
00530             num_vec=num_vectors;
00531 
00532             return sparse_feature_matrix;
00533         }
00534 
00543         void get_sparse_feature_matrix(TSparse<ST>** dst, int32_t* num_feat,
00544                 int32_t* num_vec, int64_t* nnz)
00545         {
00546             *nnz=get_num_nonzero_entries();
00547             *num_feat=num_features;
00548             *num_vec=num_vectors;
00549             *dst=sparse_feature_matrix;
00550         }
00551 
00557         void clean_tsparse(TSparse<ST>* sfm, int32_t num_vec)
00558         {
00559             if (sfm)
00560             {
00561                 for (int32_t i=0; i<num_vec; i++)
00562                     delete[] sfm[i].features;
00563 
00564                 delete[] sfm;
00565             }
00566         }
00567 
00572         CSparseFeatures<ST>* get_transposed()
00573         {
00574             int32_t num_feat;
00575             int32_t num_vec;
00576             TSparse<ST>* s=get_transposed(num_feat, num_vec);
00577             return new CSparseFeatures<ST>(s, num_feat, num_vec);
00578         }
00579 
00589         TSparse<ST>* get_transposed(int32_t &num_feat, int32_t &num_vec)
00590         {
00591             num_feat=num_vectors;
00592             num_vec=num_features;
00593 
00594             int32_t* hist=new int32_t[num_features];
00595             memset(hist, 0, sizeof(int32_t)*num_features);
00596 
00597             // count the lengths of the future (transposed) feature vectors
00598             for (int32_t v=0; v<num_vectors; v++)
00599             {
00600                 int32_t vlen;
00601                 bool vfree;
00602                 TSparseEntry<ST>* sv=get_sparse_feature_vector(v, vlen, vfree);
00603 
00604                 for (int32_t i=0; i<vlen; i++)
00605                     hist[sv[i].feat_index]++;
00606 
00607                 free_sparse_feature_vector(sv, v, vfree);
00608             }
00609 
00610             // allocate room for future feature vectors
00611             TSparse<ST>* sfm=new TSparse<ST>[num_vec];
00612             for (int32_t v=0; v<num_vec; v++)
00613             {
00614                 sfm[v].features= new TSparseEntry<ST>[hist[v]];
00615                 sfm[v].num_feat_entries=hist[v];
00616                 sfm[v].vec_index=v;
00617             }
00618 
00619             // fill future feature vectors with content
00620             memset(hist,0,sizeof(int32_t)*num_features);
00621             for (int32_t v=0; v<num_vectors; v++)
00622             {
00623                 int32_t vlen;
00624                 bool vfree;
00625                 TSparseEntry<ST>* sv=get_sparse_feature_vector(v, vlen, vfree);
00626 
00627                 for (int32_t i=0; i<vlen; i++)
00628                 {
00629                     int32_t vidx=sv[i].feat_index;
00630                     int32_t fidx=v;
00631                     sfm[vidx].features[hist[vidx]].feat_index=fidx;
00632                     sfm[vidx].features[hist[vidx]].entry=sv[i].entry;
00633                     hist[vidx]++;
00634                 }
00635 
00636                 free_sparse_feature_vector(sv, v, vfree);
00637             }
00638 
00639             delete[] hist;
00640             return sfm;
00641         }
00642 
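        /* get_transposed above works in two passes: the first pass counts, per
         * feature index, how many entries the corresponding transposed vector
         * will receive (hist), so that each TSparse row can be allocated with
         * exactly that length; the second pass reuses hist as a per-row write
         * cursor and scatters every (vector v, feature f, value) entry into
         * row f as {feat_index=v, entry=value}. */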
00652         virtual void set_sparse_feature_matrix(TSparse<ST>* src, int32_t num_feat, int32_t num_vec)
00653         {
00654             free_sparse_feature_matrix();
00655 
00656             sparse_feature_matrix=src;
00657             num_features=num_feat;
00658             num_vectors=num_vec;
00659         }
00660 
00668         ST* get_full_feature_matrix(int32_t &num_feat, int32_t &num_vec)
00669         {
00670             SG_INFO( "converting sparse features to full feature matrix of %ld x %ld entries\n", num_vectors, num_features);
00671             num_feat=num_features;
00672             num_vec=num_vectors;
00673 
00674             ST* fm=new ST[num_feat*num_vec];
00675 
00676             if (fm)
00677             {
00678                 for (int64_t i=0; i<num_feat*num_vec; i++)
00679                     fm[i]=0;
00680 
00681                 for (int32_t v=0; v<num_vec; v++)
00682                 {
00683                     for (int32_t f=0; f<sparse_feature_matrix[v].num_feat_entries; f++)
00684                     {
00685                         int64_t offs= (sparse_feature_matrix[v].vec_index * num_feat) + sparse_feature_matrix[v].features[f].feat_index;
00686                         fm[offs]= sparse_feature_matrix[v].features[f].entry;
00687                     }
00688                 }
00689             }
00690             else
00691                 SG_ERROR( "error allocating memory for dense feature matrix\n");
00692 
00693             return fm;
00694         }
00695 
00703         void get_full_feature_matrix(ST** dst, int32_t* num_feat, int32_t* num_vec)
00704         {
00705             SG_INFO( "converting sparse features to full feature matrix of %ld x %ld entries\n", num_vectors, num_features);
00706             *num_feat=num_features;
00707             *num_vec=num_vectors;
00708 
00709             *dst= (ST*) malloc(sizeof(ST)*num_features*num_vectors);
00710 
00711             if (*dst)
00712             {
00713                 for (int64_t i=0; i<num_features*num_vectors; i++)
00714                     (*dst)[i]=0;
00715 
00716                 for (int32_t v=0; v<num_vectors; v++)
00717                 {
00718                     for (int32_t f=0; f<sparse_feature_matrix[v].num_feat_entries; f++)
00719                     {
00720                         int64_t offs= (sparse_feature_matrix[v].vec_index * num_features) + sparse_feature_matrix[v].features[f].feat_index;
00721                         (*dst)[offs]= sparse_feature_matrix[v].features[f].entry;
00722                     }
00723                 }
00724             }
00725             else
00726                 SG_ERROR( "error allocating memory for dense feature matrix\n");
00727         }
00728 
00738         virtual bool set_full_feature_matrix(ST* src, int32_t num_feat, int32_t num_vec)
00739         {
00740             free_sparse_feature_matrix();
00741             bool result=true;
00742             num_features=num_feat;
00743             num_vectors=num_vec;
00744 
00745             SG_INFO("converting dense feature matrix to sparse one\n");
00746             int32_t* num_feat_entries=new int[num_vectors];
00747 
00748             if (num_feat_entries)
00749             {
00750                 int32_t num_total_entries=0;
00751 
00752                 // count the number of non-zero entries per vector
00753                 for (int32_t i=0; i< num_vec; i++)
00754                 {
00755                     num_feat_entries[i]=0;
00756                     for (int32_t j=0; j< num_feat; j++)
00757                     {
00758                         if (src[i*((int64_t) num_feat) + j] != 0)
00759                             num_feat_entries[i]++;
00760                     }
00761                 }
00762 
00763                 if (num_vec>0)
00764                 {
00765                     sparse_feature_matrix=new TSparse<ST>[num_vec];
00766 
00767                     if (sparse_feature_matrix)
00768                     {
00769                         for (int32_t i=0; i< num_vec; i++)
00770                         {
00771                             sparse_feature_matrix[i].vec_index=i;
00772                             sparse_feature_matrix[i].num_feat_entries=0;
00773                             sparse_feature_matrix[i].features= NULL;
00774 
00775                             if (num_feat_entries[i]>0)
00776                             {
00777                                 sparse_feature_matrix[i].features= new TSparseEntry<ST>[num_feat_entries[i]];
00778 
00779                                 if (!sparse_feature_matrix[i].features)
00780                                 {
00781                                     SG_INFO( "allocation of features failed\n");
00782                                     return false;
00783                                 }
00784 
00785                                 sparse_feature_matrix[i].num_feat_entries=num_feat_entries[i];
00786                                 int32_t sparse_feat_idx=0;
00787 
00788                                 for (int32_t j=0; j< num_feat; j++)
00789                                 {
00790                                     int64_t pos= i*num_feat + j;
00791 
00792                                     if (src[pos] != 0)
00793                                     {
00794                                         sparse_feature_matrix[i].features[sparse_feat_idx].entry=src[pos];
00795                                         sparse_feature_matrix[i].features[sparse_feat_idx].feat_index=j;
00796                                         sparse_feat_idx++;
00797                                         num_total_entries++;
00798                                     }
00799                                 }
00800                             }
00801                         }
00802                     }
00803                     else
00804                     {
00805                         SG_ERROR( "allocation of sparse feature matrix failed\n");
00806                         result=false;
00807                     }
00808 
00809                     SG_INFO( "sparse feature matrix has %ld entries (full matrix had %ld, sparsity %2.2f%%)\n",
00810                             num_total_entries, num_feat*num_vec, (100.0*num_total_entries)/(num_feat*num_vec));
00811                 }
00812                 else
00813                 {
00814                     SG_ERROR( "huh ? zero size matrix given ?\n");
00815                     result=false;
00816                 }
00817             }
00818             delete[] num_feat_entries;
00819             return result;
00820         }
00821 
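        /* set_full_feature_matrix above expects the dense matrix src to store
         * the num_vec examples back to back, i.e. element (feature j, vector i)
         * at src[i*num_feat + j]; get_full_feature_matrix() produces the same
         * layout. The conversion is again two-pass: first the non-zero entries
         * of every vector are counted, then each TSparse row is allocated with
         * that exact length and filled. */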
00827         virtual bool apply_preproc(bool force_preprocessing=false)
00828         {
00829             SG_INFO( "force: %d\n", force_preprocessing);
00830 
00831             if ( sparse_feature_matrix && get_num_preproc() )
00832             {
00833                 for (int32_t i=0; i<get_num_preproc(); i++)
00834                 {
00835                     if ( (!is_preprocessed(i) || force_preprocessing) )
00836                     {
00837                         set_preprocessed(i);
00838                         SG_INFO( "preprocessing using preproc %s\n", get_preproc(i)->get_name());
00839                         if (((CSparsePreProc<ST>*) get_preproc(i))->apply_to_sparse_feature_matrix(this) == NULL)
00840                             return false;
00841                     }
00843                 }
00844                 return true;
00845             }
00846             else
00847             {
00848                 SG_WARNING( "no sparse feature matrix available or features already preprocessed - skipping.\n");
00849                 return false;
00850             }
00851         }
00852 
00857         virtual int32_t get_size() { return sizeof(ST); }
00858 
00864         bool obtain_from_simple(CSimpleFeatures<ST>* sf)
00865         {
00866             int32_t num_feat=0;
00867             int32_t num_vec=0;
00868             ST* fm=sf->get_feature_matrix(num_feat, num_vec);
00869             ASSERT(fm && num_feat>0 && num_vec>0);
00870 
00871             return set_full_feature_matrix(fm, num_feat, num_vec);
00872         }
00873 
00878         virtual inline int32_t  get_num_vectors() { return num_vectors; }
00879 
00884         inline int32_t  get_num_features() { return num_features; }
00885 
00897         inline int32_t set_num_features(int32_t num)
00898         {
00899             int32_t n=num_features;
00900             ASSERT(n<=num);
00901             num_features=num;
00902             return num_features;
00903         }
00904 
00909         inline virtual EFeatureClass get_feature_class() { return C_SPARSE; }
00910 
00915         inline virtual EFeatureType get_feature_type();
00916 
00923         void free_feature_vector(TSparseEntry<ST>* feat_vec, int32_t num, bool free)
00924         {
00925             if (feature_cache)
00926                 feature_cache->unlock_entry(num);
00927 
00928             if (free)
00929                 delete[] feat_vec ;
00930         }
00931 
00936         int64_t get_num_nonzero_entries()
00937         {
00938             int64_t num=0;
00939             for (int32_t i=0; i<num_vectors; i++)
00940                 num+=sparse_feature_matrix[i].num_feat_entries;
00941 
00942             return num;
00943         }
00944 
00950         float64_t* compute_squared(float64_t* sq)
00951         {
00952             ASSERT(sq);
00953 
00954             int32_t len=0;
00955             bool do_free=false;
00956 
00957             for (int32_t i=0; i<this->get_num_vectors(); i++)
00958             {
00959                 sq[i]=0;
00960                 TSparseEntry<float64_t>* vec = ((CSparseFeatures<float64_t>*) this)->get_sparse_feature_vector(i, len, do_free);
00961 
00962                 for (int32_t j=0; j<len; j++)
00963                     sq[i] += vec[j].entry * vec[j].entry;
00964 
00965                 ((CSparseFeatures<float64_t>*) this)->free_feature_vector(vec, i, do_free);
00966             }
00967 
00968             return sq;
00969         }
00970 
00983         float64_t compute_squared_norm(CSparseFeatures<float64_t>* lhs, float64_t* sq_lhs, int32_t idx_a, CSparseFeatures<float64_t>* rhs, float64_t* sq_rhs, int32_t idx_b)
00984         {
00985             int32_t i,j;
00986             int32_t alen, blen;
00987             bool afree, bfree;
00988             ASSERT(lhs);
00989             ASSERT(rhs);
00990 
00991             TSparseEntry<float64_t>* avec=lhs->get_sparse_feature_vector(idx_a, alen, afree);
00992             TSparseEntry<float64_t>* bvec=rhs->get_sparse_feature_vector(idx_b, blen, bfree);
00993             ASSERT(avec);
00994             ASSERT(bvec);
00995 
00996             float64_t result=sq_lhs[idx_a]+sq_rhs[idx_b];
00997 
00998             if (alen<=blen)
00999             {
01000                 j=0;
01001                 for (i=0; i<alen; i++)
01002                 {
01003                     int32_t a_feat_idx=avec[i].feat_index;
01004 
01005                     while ((j<blen) && (bvec[j].feat_index < a_feat_idx))
01006                         j++;
01007 
01008                     if ((j<blen) && (bvec[j].feat_index == a_feat_idx))
01009                     {
01010                         result-=2*(avec[i].entry*bvec[j].entry);
01011                         j++;
01012                     }
01013                 }
01014             }
01015             else
01016             {
01017                 j=0;
01018                 for (i=0; i<blen; i++)
01019                 {
01020                     int32_t b_feat_idx=bvec[i].feat_index;
01021 
01022                     while ((j<alen) && (avec[j].feat_index<b_feat_idx))
01023                         j++;
01024 
01025                     if ((j<alen) && (avec[j].feat_index == b_feat_idx))
01026                     {
01027                         result-=2*(bvec[i].entry*avec[j].entry);
01028                         j++;
01029                     }
01030                 }
01031             }
01032 
01033             ((CSparseFeatures<float64_t>*) lhs)->free_feature_vector(avec, idx_a, afree);
01034             ((CSparseFeatures<float64_t>*) rhs)->free_feature_vector(bvec, idx_b, bfree);
01035 
01036             return CMath::abs(result);
01037         }
01038 
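        /* compute_squared_norm above evaluates the squared Euclidean distance
         * ||a-b||^2 = ||a||^2 + ||b||^2 - 2*(a.b): the two squared norms are
         * passed in precomputed (see compute_squared()) and only the cross
         * term is accumulated, using the same sorted-index merge as
         * sparse_dot(). The result is passed through CMath::abs(), so tiny
         * negative values caused by rounding are not returned. */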
01043         void load(CFile* loader);
01044 
01049         void save(CFile* writer);
01050 
01058         CLabels* load_svmlight_file(char* fname, bool do_sort_features=true)
01059         {
01060             CLabels* lab=NULL;
01061 
01062             size_t blocksize=1024*1024;
01063             size_t required_blocksize=blocksize;
01064             uint8_t* dummy=new uint8_t[blocksize];
01065             FILE* f=fopen(fname, "rb");
01066 
01067             if (f)
01068             {
01069                 free_sparse_feature_matrix();
01070                 num_vectors=0;
01071                 num_features=0;
01072 
01073                 SG_INFO("counting line numbers in file %s\n", fname);
01074                 size_t sz=blocksize;
01075                 size_t block_offs=0;
01076                 size_t old_block_offs=0;
01077                 fseek(f, 0, SEEK_END);
01078                 size_t fsize=ftell(f);
01079                 rewind(f);
01080 
01081                 while (sz == blocksize)
01082                 {
01083                     sz=fread(dummy, sizeof(uint8_t), blocksize, f);
01084                     bool contains_cr=false;
01085                     for (size_t i=0; i<sz; i++)
01086                     {
01087                         block_offs++;
01088                         if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
01089                         {
01090                             num_vectors++;
01091                             contains_cr=true;
01092                             required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs+1);
01093                             old_block_offs=block_offs;
01094                         }
01095                     }
01096                     SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");
01097                 }
01098 
01099                 SG_INFO("found %d feature vectors\n", num_vectors);
01100                 delete[] dummy;
01101                 blocksize=required_blocksize;
01102                 dummy = new uint8_t[blocksize+1]; //allow setting of '\0' at EOL
01103 
01104                 lab=new CLabels(num_vectors);
01105                 sparse_feature_matrix=new TSparse<ST>[num_vectors];
01106 
01107                 rewind(f);
01108                 sz=blocksize;
01109                 int32_t lines=0;
01110                 while (sz == blocksize)
01111                 {
01112                     sz=fread(dummy, sizeof(uint8_t), blocksize, f);
01113 
01114                     size_t old_sz=0;
01115                     for (size_t i=0; i<sz; i++)
01116                     {
01117                         if (i==sz-1 && dummy[i]!='\n' && sz==blocksize)
01118                         {
01119                             size_t len=i-old_sz+1;
01120                             uint8_t* data=&dummy[old_sz];
01121 
01122                             for (int32_t j=0; j<len; j++)
01123                                 dummy[j]=data[j];
01124 
01125                             sz=fread(dummy+len, sizeof(uint8_t), blocksize-len, f);
01126                             i=0;
01127                             old_sz=0;
01128                             sz+=len;
01129                         }
01130 
01131                         if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
01132                         {
01133 
01134                             size_t len=i-old_sz;
01135                             uint8_t* data=&dummy[old_sz];
01136 
01137                             int32_t dims=0;
01138                             for (int32_t j=0; j<len; j++)
01139                             {
01140                                 if (data[j]==':')
01141                                     dims++;
01142                             }
01143 
01144                             if (dims<=0)
01145                             {
01146                                 SG_ERROR("Error in line %d - number of"
01147                                         " dimensions is %d line is %d characters"
01148                                         " long\n line_content:'%.*s'\n", lines,
01149                                         dims, len, len, (const char*) data);
01150                             }
01151 
01152                             TSparseEntry<ST>* feat=new TSparseEntry<ST>[dims];
01153                             int32_t j=0;
01154                             for (; j<len; j++)
01155                             {
01156                                 if (data[j]==' ')
01157                                 {
01158                                     data[j]='\0';
01159 
01160                                     lab->set_label(lines, atof((const char*) data));
01161                                     break;
01162                                 }
01163                             }
01164 
01165                             int32_t d=0;
01166                             j++;
01167                             uint8_t* start=&data[j];
01168                             for (; j<len; j++)
01169                             {
01170                                 if (data[j]==':')
01171                                 {
01172                                     data[j]='\0';
01173 
01174                                     feat[d].feat_index=(int32_t) atoi((const char*) start)-1;
01175                                     num_features=CMath::max(num_features, feat[d].feat_index+1);
01176 
01177                                     j++;
01178                                     start=&data[j];
01179                                     for (; j<len; j++)
01180                                     {
01181                                         if (data[j]==' ' || data[j]=='\n')
01182                                         {
01183                                             data[j]='\0';
01184                                             feat[d].entry=(ST) atof((const char*) start);
01185                                             d++;
01186                                             break;
01187                                         }
01188                                     }
01189 
01190                                     if (j==len)
01191                                     {
01192                                         data[j]='\0';
01193                                         feat[dims-1].entry=(ST) atof((const char*) start);
01194                                     }
01195 
01196                                     j++;
01197                                     start=&data[j];
01198                                 }
01199                             }
01200 
01201                             sparse_feature_matrix[lines].vec_index=lines;
01202                             sparse_feature_matrix[lines].num_feat_entries=dims;
01203                             sparse_feature_matrix[lines].features=feat;
01204 
01205                             old_sz=i+1;
01206                             lines++;
01207                             SG_PROGRESS(lines, 0, num_vectors, 1, "LOADING:\t");
01208                         }
01209                     }
01210                 }
01211                 SG_INFO("file successfully read\n");
01212                 fclose(f);
01213             }
01214 
01215             delete[] dummy;
01216 
01217             if (do_sort_features)
01218                 sort_features();
01219 
01220             return lab;
01221         }
01222 
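        /* load_svmlight_file above reads one example per line in the usual
         * svmlight/libsvm text format, e.g.
         *
         *     -1 3:0.5 12:1.25 47:2
         *
         * i.e. a numeric label followed by space-separated index:value pairs.
         * The 1-based indices of the file are converted to 0-based feat_index
         * values and num_features grows to the largest index seen; with
         * do_sort_features=true the entries of every vector are sorted by
         * feat_index afterwards. */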
01225         void sort_features()
01226         {
01227             ASSERT(get_num_preproc()==0);
01228 
01229             if (!sparse_feature_matrix)
01230                 SG_ERROR("Requires sparse feature matrix to be available in-memory\n");
01231 
01232             for (int32_t i=0; i<num_vectors; i++)
01233             {
01234                 int32_t len=sparse_feature_matrix[i].num_feat_entries;
01235 
01236                 if (!len)
01237                     continue;
01238 
01239                 TSparseEntry<ST>* sf_orig=sparse_feature_matrix[i].features;
01240                 int32_t* feat_idx=new int32_t[len];
01241                 int32_t* orig_idx=new int32_t[len];
01242 
01243                 for (int j=0; j<len; j++)
01244                 {
01245                     feat_idx[j]=sf_orig[j].feat_index;
01246                     orig_idx[j]=j;
01247                 }
01248 
01249                 CMath::qsort_index(feat_idx, orig_idx, len);
01250 
01251                 TSparseEntry<ST>* sf_new= new TSparseEntry<ST>[len];
01252                 for (int j=0; j<len; j++)
01253                     sf_new[j]=sf_orig[orig_idx[j]];
01254 
01255                 sparse_feature_matrix[i].features=sf_new;
01256 
01257                 // sanity check
01258                 for (int j=0; j<len-1; j++)
01259                     ASSERT(sf_new[j].feat_index<sf_new[j+1].feat_index);
01260 
01261                 delete[] orig_idx;
01262                 delete[] feat_idx;
01263                 delete[] sf_orig;
01264             }
01265         }
01266 
01273         bool write_svmlight_file(char* fname, CLabels* label)
01274         {
01275             ASSERT(label);
01276             int32_t num=label->get_num_labels();
01277             ASSERT(num>0);
01278             ASSERT(num==num_vectors);
01279 
01280             FILE* f=fopen(fname, "wb");
01281 
01282             if (f)
01283             {
01284                 for (int32_t i=0; i<num; i++)
01285                 {
01286                     fprintf(f, "%d ", (int32_t) label->get_int_label(i));
01287 
01288                     TSparseEntry<ST>* vec = sparse_feature_matrix[i].features;
01289                     int32_t num_feat = sparse_feature_matrix[i].num_feat_entries;
01290 
01291                     for (int32_t j=0; j<num_feat; j++)
01292                     {
01293                         if (j<num_feat-1)
01294                             fprintf(f, "%d:%f ", (int32_t) vec[j].feat_index+1, (double) vec[j].entry);
01295                         else
01296                             fprintf(f, "%d:%f\n", (int32_t) vec[j].feat_index+1, (double) vec[j].entry);
01297                     }
01298                 }
01299 
01300                 fclose(f);
01301                 return true;
01302             }
01303             return false;
01304         }
01305 
01313         virtual int32_t get_dim_feature_space()
01314         {
01315             return num_features;
01316         }
01317 
01324         virtual float64_t dot(int32_t vec_idx1, int32_t vec_idx2)
01325         {
01326             bool afree, bfree;
01327             int32_t alen, blen;
01328             TSparseEntry<ST>* avec=get_sparse_feature_vector(vec_idx1, alen, afree);
01329             TSparseEntry<ST>* bvec=get_sparse_feature_vector(vec_idx2, blen, bfree);
01330 
01331             float64_t result=sparse_dot(1, avec, alen, bvec, blen);
01332 
01333             free_sparse_feature_vector(avec, vec_idx1, afree);
01334             free_sparse_feature_vector(bvec, vec_idx2, bfree);
01335 
01336             return result;
01337         }
01338 
01345         virtual float64_t dense_dot(int32_t vec_idx1, const float64_t* vec2, int32_t vec2_len)
01346         {
01347             ASSERT(vec2);
01348             if (vec2_len!=num_features)
01349             {
01350                 SG_ERROR("dimension of vec2 (=%d) does not match number of features (=%d)\n",
01351                         vec2_len, num_features);
01352             }
01353             float64_t result=0;
01354 
01355             bool vfree;
01356             int32_t vlen;
01357             TSparseEntry<ST>* sv=get_sparse_feature_vector(vec_idx1, vlen, vfree);
01358 
01359             if (sv)
01360             {
01361                 for (int32_t i=0; i<vlen; i++)
01362                     result+=vec2[sv[i].feat_index]*sv[i].entry;
01363             }
01364 
01365             free_sparse_feature_vector(sv, vec_idx1, vfree);
01366 
01367             return result;
01368         }
01369 
01371         struct sparse_feature_iterator
01372         {
01374             TSparseEntry<ST>* sv;
01376             int32_t vidx;
01378             int32_t num_feat_entries;
01380             bool vfree;
01381 
01383             int32_t index;
01384 
01386             void print_info()
01387             {
01388                 SG_SPRINT("sv=%p, vidx=%d, num_feat_entries=%d, index=%d\n",
01389                         sv, vidx, num_feat_entries, index);
01390             }
01391         };
01392 
01402         virtual void* get_feature_iterator(int32_t vector_index)
01403         {
01404             if (vector_index>=num_vectors)
01405             {
01406                 SG_ERROR("Index out of bounds (number of vectors %d, you "
01407                         "requested %d)\n", num_vectors, vector_index);
01408             }
01409 
01410             if (!sparse_feature_matrix)
01411                 SG_ERROR("Requires an in-memory feature matrix\n");
01412 
01413             sparse_feature_iterator* it=new sparse_feature_iterator[1];
01414             it->sv=get_sparse_feature_vector(vector_index, it->num_feat_entries, it->vfree);
01415             it->vidx=vector_index;
01416             it->index=0;
01417 
01418             return it;
01419         }
01420 
01431         virtual bool get_next_feature(int32_t& index, float64_t& value, void* iterator)
01432         {
01433             sparse_feature_iterator* it=(sparse_feature_iterator*) iterator;
01434             if (!it || it->index >= it->num_feat_entries)
01435                 return false;
01436 
01437             int32_t i=it->index++;
01438 
01439             index =  it->sv[i].feat_index;
01440             value = (float64_t) it->sv[i].entry;
01441 
01442             return true;
01443         }
01444 
01450         virtual void free_feature_iterator(void* iterator)
01451         {
01452             if (!iterator)
01453                 return;
01454 
01455             sparse_feature_iterator* it=(sparse_feature_iterator*) iterator;
01456             free_sparse_feature_vector(it->sv, it->vidx, it->vfree);
01457             delete[] it;
01458         }
01459 
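        /* The three methods above implement the feature-iteration interface of
         * the CDotFeatures base class; a typical traversal of one vector looks
         * like
         *
         *     void* it=feats->get_feature_iterator(vec_idx);
         *     int32_t idx;
         *     float64_t val;
         *     while (feats->get_next_feature(idx, val, it))
         *     {
         *         // consume (idx, val)
         *     }
         *     feats->free_feature_iterator(it);
         */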
01461         inline virtual const char* get_name() const { return "SparseFeatures"; }
01462 
01463     protected:
01474         virtual TSparseEntry<ST>* compute_sparse_feature_vector(int32_t num, int32_t& len, TSparseEntry<ST>* target=NULL)
01475         {
01476             len=0;
01477             return NULL;
01478         }
01479 
01480     protected:
01481 
01483         int32_t num_vectors;
01484 
01486         int32_t num_features;
01487 
01489         TSparse<ST>* sparse_feature_matrix;
01490 
01492         CCache< TSparseEntry<ST> >* feature_cache;
01493 };
01494 
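/* The macros below generate explicit template specializations for every
 * supported element type: GET_FEATURE_TYPE maps e.g.
 * CSparseFeatures<float64_t>::get_feature_type() to F_DREAL, while LOAD and
 * WRITE forward load()/save() to the matching sparse-matrix accessors of
 * CFile (get_real_sparsematrix(), set_real_sparsematrix(), ...). */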
01495 #ifndef DOXYGEN_SHOULD_SKIP_THIS
01496 #define GET_FEATURE_TYPE(sg_type, f_type)                                   \
01497 template<> inline EFeatureType CSparseFeatures<sg_type>::get_feature_type() \
01498 {                                                                           \
01499     return f_type;                                                          \
01500 }
01501 GET_FEATURE_TYPE(bool, F_BOOL)
01502 GET_FEATURE_TYPE(char, F_CHAR)
01503 GET_FEATURE_TYPE(uint8_t, F_BYTE)
01504 GET_FEATURE_TYPE(int16_t, F_SHORT)
01505 GET_FEATURE_TYPE(uint16_t, F_WORD)
01506 GET_FEATURE_TYPE(int32_t, F_INT)
01507 GET_FEATURE_TYPE(uint32_t, F_UINT)
01508 GET_FEATURE_TYPE(int64_t, F_LONG)
01509 GET_FEATURE_TYPE(uint64_t, F_ULONG)
01510 GET_FEATURE_TYPE(float32_t, F_SHORTREAL)
01511 GET_FEATURE_TYPE(float64_t, F_DREAL)
01512 GET_FEATURE_TYPE(floatmax_t, F_LONGREAL)
01513 #undef GET_FEATURE_TYPE
01514 
01515 #define LOAD(fname, sg_type)                                            \
01516 template<> inline void CSparseFeatures<sg_type>::load(CFile* loader)    \
01517 {                                                                       \
01518     ASSERT(loader);                                                     \
01519     TSparse<sg_type>* matrix=NULL;                                      \
01520     int32_t num_feat=0;                                                 \
01521     int32_t num_vec=0;                                                  \
01522     loader->fname(matrix, num_feat, num_vec);           \
01523     set_sparse_feature_matrix(matrix, num_feat, num_vec);               \
01524 }
01525 LOAD(get_bool_sparsematrix, bool)
01526 LOAD(get_char_sparsematrix, char)
01527 LOAD(get_byte_sparsematrix, uint8_t)
01528 LOAD(get_short_sparsematrix, int16_t)
01529 LOAD(get_word_sparsematrix, uint16_t)
01530 LOAD(get_int_sparsematrix, int32_t)
01531 LOAD(get_uint_sparsematrix, uint32_t)
01532 LOAD(get_long_sparsematrix, int64_t)
01533 LOAD(get_ulong_sparsematrix, uint64_t)
01534 LOAD(get_shortreal_sparsematrix, float32_t)
01535 LOAD(get_real_sparsematrix, float64_t)
01536 LOAD(get_longreal_sparsematrix, floatmax_t)
01537 #undef LOAD
01538 
01539 #define WRITE(fname, sg_type)                                           \
01540 template<> inline void CSparseFeatures<sg_type>::save(CFile* writer)    \
01541 {                                                                       \
01542     ASSERT(writer);                                                     \
01543     writer->fname(sparse_feature_matrix, num_features, num_vectors);    \
01544 }
01545 WRITE(set_bool_sparsematrix, bool)
01546 WRITE(set_char_sparsematrix, char)
01547 WRITE(set_byte_sparsematrix, uint8_t)
01548 WRITE(set_short_sparsematrix, int16_t)
01549 WRITE(set_word_sparsematrix, uint16_t)
01550 WRITE(set_int_sparsematrix, int32_t)
01551 WRITE(set_uint_sparsematrix, uint32_t)
01552 WRITE(set_long_sparsematrix, int64_t)
01553 WRITE(set_ulong_sparsematrix, uint64_t)
01554 WRITE(set_shortreal_sparsematrix, float32_t)
01555 WRITE(set_real_sparsematrix, float64_t)
01556 WRITE(set_longreal_sparsematrix, floatmax_t)
01557 #undef WRITE
01558 #endif // DOXYGEN_SHOULD_SKIP_THIS
01559 }
01560 #endif /* _SPARSEFEATURES__H__ */
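
The following sketch illustrates one way the class documented above can be used from C++: a small dense matrix is converted into CSparseFeatures<float64_t> and a few accessors are queried. It is an illustrative example only, written under the usual conventions of the shogun C++ examples; in particular the include paths, the init_shogun()/exit_shogun() calls and the SG_UNREF cleanup macro come from the surrounding library, not from this header, and are assumptions here.

    #include "base/init.h"
    #include "features/SparseFeatures.h"

    using namespace shogun;

    int main()
    {
        init_shogun();  // set up the global io/math/version objects

        // 3 features x 2 vectors; element (feature j, vector i) sits at
        // fm[i*num_feat + j], matching set_full_feature_matrix() above.
        float64_t fm[6]={1.0, 0.0, 2.0,   /* vector 0 = [1, 0, 2] */
                         0.0, 3.0, 0.0};  /* vector 1 = [0, 3, 0] */

        CSparseFeatures<float64_t>* feats=
            new CSparseFeatures<float64_t>(fm, 3, 2);

        int32_t nnz0=feats->get_nnz_features_for_vector(0); // 2 non-zeros
        float64_t d01=feats->dot(0, 1);                     // 0, disjoint support

        float64_t dense[3]={1.0, 1.0, 1.0};
        float64_t d0=feats->dense_dot(0, dense, 3);         // 1 + 2 = 3

        SG_SPRINT("nnz0=%d d01=%f d0=%f\n", nnz0, d01, d0);

        SG_UNREF(feats);   // shogun objects are reference counted
        exit_shogun();
        return 0;
    }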