SparseFeatures.h

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2009 Soeren Sonnenburg
00008  * Written (W) 1999-2008 Gunnar Raetsch
00009  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
00010  */
00011 
00012 #ifndef _SPARSEFEATURES__H__
00013 #define _SPARSEFEATURES__H__
00014 
00015 #include <string.h>
00016 #include <stdlib.h>
00017 
00018 #include "lib/common.h"
00019 #include "lib/Mathematics.h"
00020 #include "lib/Cache.h"
00021 #include "lib/io.h"
00022 #include "lib/Cache.h"
00023 
00024 #include "features/Labels.h"
00025 #include "features/Features.h"
00026 #include "features/DotFeatures.h"
00027 #include "features/SimpleFeatures.h"
00028 #include "preproc/SparsePreProc.h"
00029 
00030 template <class ST> class CSparsePreProc;
00031 
00032 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00033 
/** template struct TSparseEntry: a single (index, value) pair of a
 * sparse feature vector */
00034 template <class ST> struct TSparseEntry
00035 {
    /** index of the feature this entry corresponds to */
00037     int32_t feat_index;
    /** value of the feature at that index */
00039     ST entry;
00040 };
00041 
/** template struct TSparse: one sparse feature vector, stored as an
 * array of (index, value) entries */
00043 template <class ST> struct TSparse
00044 {
00045     public:
        /** index of this vector within the feature matrix */
00047         int32_t vec_index;
        /** number of (non-zero) entries in the features array */
00049         int32_t num_feat_entries;
        /** array of num_feat_entries sparse entries */
00051         TSparseEntry<ST>* features;
00052 };
00053 #endif // DOXYGEN_SHOULD_SKIP_THIS
00054 
00067 template <class ST> class CSparseFeatures : public CDotFeatures
00068 {
00069     public:
        /** default constructor
         *
         * @param size cache size (number of feature vectors to cache),
         *             forwarded to CDotFeatures
         */
00074         CSparseFeatures(int32_t size=0)
00075         : CDotFeatures(size), num_vectors(0), num_features(0),
00076             sparse_feature_matrix(NULL), feature_cache(NULL)
00077         {}
00078 
        /** constructor taking an already sparse feature matrix
         *
         * NOTE(review): ownership of src is taken over by this object
         * (set_sparse_feature_matrix stores the pointer; the destructor
         * frees it) — confirm against callers.
         *
         * @param src sparse feature matrix (array of num_vec TSparse vectors)
         * @param num_feat number of features per vector
         * @param num_vec number of vectors
         */
00086         CSparseFeatures(TSparse<ST>* src, int32_t num_feat, int32_t num_vec)
00087         : CDotFeatures(0), num_vectors(0), num_features(0),
00088             sparse_feature_matrix(NULL), feature_cache(NULL)
00089         {
00090             set_sparse_feature_matrix(src, num_feat, num_vec);
00091         }
00092 
        /** constructor converting a dense feature matrix to sparse storage
         *
         * @param src dense feature matrix, src[i*num_feat+j] = feature j of vector i
         * @param num_feat number of features per vector
         * @param num_vec number of vectors
         */
00100         CSparseFeatures(ST* src, int32_t num_feat, int32_t num_vec)
00101         : CDotFeatures(0), num_vectors(0), num_features(0),
00102             sparse_feature_matrix(NULL), feature_cache(NULL)
00103         {
00104             set_full_feature_matrix(src, num_feat, num_vec);
00105         }
00106 
00108         CSparseFeatures(const CSparseFeatures & orig)
00109         : CDotFeatures(orig), num_vectors(orig.num_vectors),
00110             num_features(orig.num_features),
00111             sparse_feature_matrix(orig.sparse_feature_matrix),
00112             feature_cache(orig.feature_cache)
00113         {
00114             if (orig.sparse_feature_matrix)
00115             {
00116                 free_sparse_feature_matrix();
00117                 sparse_feature_matrix=new TSparse<ST>[num_vectors];
00118                 memcpy(sparse_feature_matrix, orig.sparse_feature_matrix, sizeof(TSparse<ST>)*num_vectors);
00119                 for (int32_t i=0; i< num_vectors; i++)
00120                 {
00121                     sparse_feature_matrix[i].features=new TSparseEntry<ST>[sparse_feature_matrix[i].num_feat_entries];
00122                     memcpy(sparse_feature_matrix[i].features, orig.sparse_feature_matrix[i].features, sizeof(TSparseEntry<ST>)*sparse_feature_matrix[i].num_feat_entries);
00123 
00124                 }
00125             }
00126         }
00127 
        /** constructor loading features from a file
         *
         * @param fname filename to load features from (loading itself is
         *        delegated to the CDotFeatures base; nothing else is read here)
         */
00132         CSparseFeatures(char* fname)
00133         : CDotFeatures(fname), num_vectors(0), num_features(0),
00134             sparse_feature_matrix(NULL), feature_cache(NULL)
00135         {}
00136 
        /** destructor: frees the sparse feature matrix and the feature cache */
00137         virtual ~CSparseFeatures()
00138         {
00139             free_sparse_features();
00140         }
00141 
        /** free the sparse feature matrix (all entry arrays and the vector
         * array itself) and reset the dimensions to 0 */
00145         void free_sparse_feature_matrix()
00146         {
00147             clean_tsparse(sparse_feature_matrix, num_vectors);
00148             sparse_feature_matrix = NULL;
00149             num_vectors=0;
00150             num_features=0;
00151         }
00152 
        /** free the sparse feature matrix and, additionally, the feature cache */
00156         void free_sparse_features()
00157         {
00158             free_sparse_feature_matrix();
00159             delete feature_cache;
00160             feature_cache = NULL;
00161         }
00162 
        /** duplicate this feature object via the copy constructor
         *
         * @return newly allocated copy (caller takes ownership)
         */
00167         virtual CFeatures* duplicate() const
00168         {
00169             return new CSparseFeatures<ST>(*this);
00170         }
00171 
00180         ST* get_full_feature_vector(int32_t num, int32_t& len)
00181         {
00182             bool vfree;
00183             int32_t num_feat;
00184             int32_t i;
00185             len=0;
00186             TSparseEntry<ST>* sv=get_sparse_feature_vector(num, num_feat, vfree);
00187             ST* fv=NULL;
00188 
00189             if (sv)
00190             {
00191                 len=num_features;
00192                 fv=new ST[num_features];
00193 
00194                 for (i=0; i<num_features; i++)
00195                     fv[i]=0;
00196 
00197                 for (i=0; i<num_feat; i++)
00198                     fv[sv[i].feat_index]= sv[i].entry;
00199             }
00200 
00201             free_sparse_feature_vector(sv, num, vfree);
00202 
00203             return fv;
00204         }
00205 
00212         void get_full_feature_vector(ST** dst, int32_t* len, int32_t num)
00213         {
00214             if (num>=num_vectors)
00215             {
00216                 SG_ERROR("Index out of bounds (number of vectors %d, you "
00217                         "requested %d)\n", num_vectors, num);
00218             }
00219 
00220             bool vfree;
00221             int32_t num_feat=0;
00222             *len=0;
00223             TSparseEntry<ST>* sv=get_sparse_feature_vector(num, num_feat, vfree);
00224 
00225             if (sv)
00226             {
00227                 *len=num_features;
00228                 *dst= (ST*) malloc(sizeof(ST)*num_features);
00229                 memset(*dst, 0, sizeof(ST)*num_features);
00230 
00231                 for (int32_t i=0; i<num_feat; i++)
00232                     (*dst)[sv[i].feat_index]= sv[i].entry;
00233             }
00234 
00235             free_sparse_feature_vector(sv, num, vfree);
00236         }
00237 
00238 
00244         virtual inline int32_t get_nnz_features_for_vector(int32_t num)
00245         {
00246             bool vfree;
00247             int32_t len;
00248             TSparseEntry<ST>* sv = get_sparse_feature_vector(num, len, vfree);
00249             free_sparse_feature_vector(sv, num, vfree);
00250             return len;
00251         }
00252 
        /** Get sparse feature vector `num`.
         *
         * If a sparse feature matrix is present, a pointer into it is
         * returned directly (vfree=false). Otherwise the vector is looked
         * up in / inserted into the feature cache, or computed on the fly
         * (vfree=true means the caller owns the returned array and
         * free_sparse_feature_vector() will delete[] it).
         *
         * @param num index of the vector
         * @param len set to the number of sparse entries returned
         * @param vfree set to true iff the caller must free the vector
         * @return array of len sparse entries
         */
00263         TSparseEntry<ST>* get_sparse_feature_vector(int32_t num, int32_t& len, bool& vfree)
00264         {
00265             ASSERT(num<num_vectors);
00266 
00267             if (sparse_feature_matrix)
00268             {
                // fast path: hand out a pointer into the stored matrix
00269                 len= sparse_feature_matrix[num].num_feat_entries;
00270                 vfree=false ;
00271                 return sparse_feature_matrix[num].features;
00272             } 
00273             else
00274             {
00275                 TSparseEntry<ST>* feat=NULL;
00276                 vfree=false;
00277 
00278                 if (feature_cache)
00279                 {
00280                     feat=feature_cache->lock_entry(num);
00281 
00282                     if (feat)
00283                         return feat;
00284                     else
00285                     {
                        // cache miss: reserve a cache slot to compute into
00286                         feat=feature_cache->set_entry(num);
00287                     }
00288                 }
00289 
                // no cache slot available -> vector is freshly allocated
                // and the caller becomes responsible for freeing it
00290                 if (!feat)
00291                     vfree=true;
00292 
00293                 feat=compute_sparse_feature_vector(num, len, feat);
00294 
00295 
00296                 if (get_num_preproc())
00297                 {
00298                     int32_t tmp_len=len;
00299                     TSparseEntry<ST>* tmp_feat_before = feat;
00300                     TSparseEntry<ST>* tmp_feat_after = NULL;
00301 
00302                     for (int32_t i=0; i<get_num_preproc(); i++)
00303                     {
                        // NOTE(review): the apply call below is commented
                        // out, so tmp_feat_after stays NULL and the memcpy
                        // after this loop would copy from NULL if any
                        // preprocessor is registered — confirm intent.
00304                         //tmp_feat_after=((CSparsePreProc<ST>*) get_preproc(i))->apply_to_feature_vector(tmp_feat_before, tmp_len);
00305 
00306                         if (i!=0)   // delete feature vector, except for the the first one, i.e., feat
00307                             delete[] tmp_feat_before;
00308                         tmp_feat_before=tmp_feat_after;
00309                     }
00310 
00311                     memcpy(feat, tmp_feat_after, sizeof(TSparseEntry<ST>)*tmp_len);
00312                     delete[] tmp_feat_after;
00313                     len=tmp_len ;
00314                     SG_DEBUG( "len: %d len2: %d\n", len, num_features);
00315                 }
00316                 return feat ;
00317             }
00318         }
00319 
00320 
00331         ST sparse_dot(ST alpha, TSparseEntry<ST>* avec, int32_t alen, TSparseEntry<ST>* bvec, int32_t blen)
00332         {
00333             ST result=0;
00334 
00335             //result remains zero when one of the vectors is non existent
00336             if (avec && bvec)
00337             {
00338                 if (alen<=blen)
00339                 {
00340                     int32_t j=0;
00341                     for (int32_t i=0; i<alen; i++)
00342                     {
00343                         int32_t a_feat_idx=avec[i].feat_index;
00344 
00345                         while ( (j<blen) && (bvec[j].feat_index < a_feat_idx) )
00346                             j++;
00347 
00348                         if ( (j<blen) && (bvec[j].feat_index == a_feat_idx) )
00349                         {
00350                             result+= avec[i].entry * bvec[j].entry;
00351                             j++;
00352                         }
00353                     }
00354                 }
00355                 else
00356                 {
00357                     int32_t j=0;
00358                     for (int32_t i=0; i<blen; i++)
00359                     {
00360                         int32_t b_feat_idx=bvec[i].feat_index;
00361 
00362                         while ( (j<alen) && (avec[j].feat_index < b_feat_idx) )
00363                             j++;
00364 
00365                         if ( (j<alen) && (avec[j].feat_index == b_feat_idx) )
00366                         {
00367                             result+= bvec[i].entry * avec[j].entry;
00368                             j++;
00369                         }
00370                     }
00371                 }
00372 
00373                 result*=alpha;
00374             }
00375 
00376             return result;
00377         }
00378 
00389         ST dense_dot(ST alpha, int32_t num, ST* vec, int32_t dim, ST b)
00390         {
00391             ASSERT(vec);
00392             ASSERT(dim==num_features);
00393             ST result=b;
00394 
00395             bool vfree;
00396             int32_t num_feat;
00397             TSparseEntry<ST>* sv=get_sparse_feature_vector(num, num_feat, vfree);
00398 
00399             if (sv)
00400             {
00401                 for (int32_t i=0; i<num_feat; i++)
00402                     result+=alpha*vec[sv[i].feat_index]*sv[i].entry;
00403             }
00404 
00405             free_sparse_feature_vector(sv, num, vfree);
00406             return result;
00407         }
00408 
00418         void add_to_dense_vec(float64_t alpha, int32_t num, float64_t* vec, int32_t dim, bool abs_val=false)
00419         {
00420             ASSERT(vec);
00421             ASSERT(dim==num_features);
00422 
00423             bool vfree;
00424             int32_t num_feat;
00425             TSparseEntry<ST>* sv=get_sparse_feature_vector(num, num_feat, vfree);
00426 
00427             if (sv)
00428             {
00429                 if (abs_val)
00430                 {
00431                     for (int32_t i=0; i<num_feat; i++)
00432                         vec[sv[i].feat_index]+= alpha*CMath::abs(sv[i].entry);
00433                 }
00434                 else
00435                 {
00436                     for (int32_t i=0; i<num_feat; i++)
00437                         vec[sv[i].feat_index]+= alpha*sv[i].entry;
00438                 }
00439             }
00440 
00441             free_sparse_feature_vector(sv, num, vfree);
00442         }
00443 
        /** Release a vector obtained from get_sparse_feature_vector():
         * unlocks the cache entry (if a cache exists) and deletes the
         * array when the caller was given ownership.
         *
         * @param feat_vec the vector to release
         * @param num index the vector was fetched with
         * @param free the vfree flag returned by get_sparse_feature_vector()
         */
00450         void free_sparse_feature_vector(TSparseEntry<ST>* feat_vec, int32_t num, bool free)
00451         {
00452             if (feature_cache)
00453                 feature_cache->unlock_entry(num);
00454 
00455             if (free)
00456                 delete[] feat_vec ;
00457         } 
00458 
        /** Get a non-owning pointer to the internal sparse feature matrix.
         *
         * @param num_feat set to the number of features per vector
         * @param num_vec set to the number of vectors
         * @return internal matrix pointer (do not free)
         */
00466         TSparse<ST>* get_sparse_feature_matrix(int32_t &num_feat, int32_t &num_vec)
00467         {
00468             num_feat=num_features;
00469             num_vec=num_vectors;
00470 
00471             return sparse_feature_matrix;
00472         }
00473 
        /** Get the internal sparse feature matrix via output parameters
         * (non-owning; do not free *dst).
         *
         * @param dst set to the internal matrix pointer
         * @param num_feat set to the number of features per vector
         * @param num_vec set to the number of vectors
         * @param nnz set to the total number of non-zero entries
         */
00482         void get_sparse_feature_matrix(TSparse<ST>** dst, int32_t* num_feat,
00483                 int32_t* num_vec, int64_t* nnz)
00484         {
00485             *nnz=get_num_nonzero_entries();
00486             *num_feat=num_features;
00487             *num_vec=num_vectors;
00488             *dst=sparse_feature_matrix;
00489         }
00490 
00496         void clean_tsparse(TSparse<ST>* sfm, int32_t num_vec)
00497         {
00498             if (sfm)
00499             {
00500                 for (int32_t i=0; i<num_vec; i++)
00501                     delete[] sfm[i].features;
00502 
00503                 delete[] sfm;
00504             }
00505         }
00506 
        /** Compute the transposed sparse matrix in two passes: first a
         * histogram of per-feature counts sizes the new vectors, then the
         * entries are scattered into place.
         *
         * @param num_feat set to the transposed feature count (= num_vectors)
         * @param num_vec set to the transposed vector count (= num_features)
         * @return newly allocated transposed matrix (caller frees via clean_tsparse)
         */
00516         TSparse<ST>* get_transposed(int32_t &num_feat, int32_t &num_vec)
00517         {
00518             num_feat=num_vectors;
00519             num_vec=num_features;
00520 
00521             int32_t* hist=new int32_t[num_features];
00522             memset(hist, 0, sizeof(int32_t)*num_features);
00523 
            // pass 1: count how often each feature index occurs; this is
            // the length of each future (transposed) vector
00524             // count how lengths of future feature vectors
00525             for (int32_t v=0; v<num_vectors; v++)
00526             {
00527                 int32_t vlen;
00528                 bool vfree;
00529                 TSparseEntry<ST>* sv=get_sparse_feature_vector(v, vlen, vfree);
00530 
00531                 for (int32_t i=0; i<vlen; i++)
00532                     hist[sv[i].feat_index]++;
00533 
00534                 free_sparse_feature_vector(sv, v, vfree);
00535             }
00536 
00537             // allocate room for future feature vectors
00538             TSparse<ST>* sfm=new TSparse<ST>[num_vec];
00539             for (int32_t v=0; v<num_vec; v++)
00540             {
00541                 sfm[v].features= new TSparseEntry<ST>[hist[v]];
00542                 sfm[v].num_feat_entries=hist[v];
00543                 sfm[v].vec_index=v;
00544             }
00545 
            // pass 2: reuse hist as a per-row write cursor and scatter
            // every (vector, feature, value) triple into its transposed slot
00546             // fill future feature vectors with content
00547             memset(hist,0,sizeof(int32_t)*num_features);
00548             for (int32_t v=0; v<num_vectors; v++)
00549             {
00550                 int32_t vlen;
00551                 bool vfree;
00552                 TSparseEntry<ST>* sv=get_sparse_feature_vector(v, vlen, vfree);
00553 
00554                 for (int32_t i=0; i<vlen; i++)
00555                 {
00556                     int32_t vidx=sv[i].feat_index;
00557                     int32_t fidx=v;
00558                     sfm[vidx].features[hist[vidx]].feat_index=fidx;
00559                     sfm[vidx].features[hist[vidx]].entry=sv[i].entry;
00560                     hist[vidx]++;
00561                 }
00562 
00563                 free_sparse_feature_vector(sv, v, vfree);
00564             }
00565 
00566             delete[] hist;
00567             return sfm;
00568         }
00569 
        /** Replace the stored sparse feature matrix; the previous matrix is
         * freed and ownership of src is taken over (the destructor frees it).
         *
         * @param src new sparse feature matrix
         * @param num_feat number of features per vector
         * @param num_vec number of vectors
         */
00579         virtual void set_sparse_feature_matrix(TSparse<ST>* src, int32_t num_feat, int32_t num_vec)
00580         {
00581             free_sparse_feature_matrix();
00582 
00583             sparse_feature_matrix=src;
00584             num_features=num_feat;
00585             num_vectors=num_vec;
00586         }
00587 
00595         ST* get_full_feature_matrix(int32_t &num_feat, int32_t &num_vec)
00596         {
00597             SG_INFO( "converting sparse features to full feature matrix of %ld x %ld entries\n", num_vectors, num_features);
00598             num_feat=num_features;
00599             num_vec=num_vectors;
00600 
00601             ST* fm=new ST[num_feat*num_vec];
00602 
00603             if (fm)
00604             {
00605                 for (int64_t i=0; i<num_feat*num_vec; i++)
00606                     fm[i]=0;
00607 
00608                 for (int32_t v=0; v<num_vec; v++)
00609                 {
00610                     for (int32_t f=0; f<sparse_feature_matrix[v].num_feat_entries; f++)
00611                     {
00612                         int64_t offs= (sparse_feature_matrix[v].vec_index * num_feat) + sparse_feature_matrix[v].features[f].feat_index;
00613                         fm[offs]= sparse_feature_matrix[v].features[f].entry;
00614                     }
00615                 }
00616             }
00617             else
00618                 SG_ERROR( "error allocating memory for dense feature matrix\n");
00619 
00620             return fm;
00621         }
00622 
00630         void get_full_feature_matrix(ST** dst, int32_t* num_feat, int32_t* num_vec)
00631         {
00632             SG_INFO( "converting sparse features to full feature matrix of %ld x %ld entries\n", num_vectors, num_features);
00633             *num_feat=num_features;
00634             *num_vec=num_vectors;
00635 
00636             *dst= (ST*) malloc(sizeof(ST)*num_features*num_vectors);
00637 
00638             if (*dst)
00639             {
00640                 for (int64_t i=0; i<num_features*num_vectors; i++)
00641                     (*dst)[i]=0;
00642 
00643                 for (int32_t v=0; v<num_vectors; v++)
00644                 {
00645                     for (int32_t f=0; f<sparse_feature_matrix[v].num_feat_entries; f++)
00646                     {
00647                         int64_t offs= (sparse_feature_matrix[v].vec_index * num_features) + sparse_feature_matrix[v].features[f].feat_index;
00648                         (*dst)[offs]= sparse_feature_matrix[v].features[f].entry;
00649                     }
00650                 }
00651             }
00652             else
00653                 SG_ERROR( "error allocating memory for dense feature matrix\n");
00654         }
00655 
00665         virtual bool set_full_feature_matrix(ST* src, int32_t num_feat, int32_t num_vec)
00666         {
00667             free_sparse_feature_matrix();
00668             bool result=true;
00669             num_features=num_feat;
00670             num_vectors=num_vec;
00671 
00672             SG_INFO("converting dense feature matrix to sparse one\n");
00673             int32_t* num_feat_entries=new int[num_vectors];
00674 
00675             if (num_feat_entries)
00676             {
00677                 int32_t num_total_entries=0;
00678 
00679                 // count nr of non sparse features
00680                 for (int32_t i=0; i< num_vec; i++)
00681                 {
00682                     num_feat_entries[i]=0;
00683                     for (int32_t j=0; j< num_feat; j++)
00684                     {
00685                         if (src[i*((int64_t) num_feat) + j] != 0)
00686                             num_feat_entries[i]++;
00687                     }
00688                 }
00689 
00690                 if (num_vec>0)
00691                 {
00692                     sparse_feature_matrix=new TSparse<ST>[num_vec];
00693 
00694                     if (sparse_feature_matrix)
00695                     {
00696                         for (int32_t i=0; i< num_vec; i++)
00697                         {
00698                             sparse_feature_matrix[i].vec_index=i;
00699                             sparse_feature_matrix[i].num_feat_entries=0;
00700                             sparse_feature_matrix[i].features= NULL;
00701 
00702                             if (num_feat_entries[i]>0)
00703                             {
00704                                 sparse_feature_matrix[i].features= new TSparseEntry<ST>[num_feat_entries[i]];
00705 
00706                                 if (!sparse_feature_matrix[i].features)
00707                                 {
00708                                     SG_INFO( "allocation of features failed\n");
00709                                     return false;
00710                                 }
00711 
00712                                 sparse_feature_matrix[i].num_feat_entries=num_feat_entries[i];
00713                                 int32_t sparse_feat_idx=0;
00714 
00715                                 for (int32_t j=0; j< num_feat; j++)
00716                                 {
00717                                     int64_t pos= i*num_feat + j;
00718 
00719                                     if (src[pos] != 0)
00720                                     {
00721                                         sparse_feature_matrix[i].features[sparse_feat_idx].entry=src[pos];
00722                                         sparse_feature_matrix[i].features[sparse_feat_idx].feat_index=j;
00723                                         sparse_feat_idx++;
00724                                         num_total_entries++;
00725                                     }
00726                                 }
00727                             }
00728                         }
00729                     }
00730                     else
00731                     {
00732                         SG_ERROR( "allocation of sparse feature matrix failed\n");
00733                         result=false;
00734                     }
00735 
00736                     SG_INFO( "sparse feature matrix has %ld entries (full matrix had %ld, sparsity %2.2f%%)\n",
00737                             num_total_entries, num_feat*num_vec, (100.0*num_total_entries)/(num_feat*num_vec));
00738                 }
00739                 else
00740                 {
00741                     SG_ERROR( "huh ? zero size matrix given ?\n");
00742                     result=false;
00743                 }
00744             }
00745             delete[] num_feat_entries;
00746             return result;
00747         }
00748 
00754         virtual bool apply_preproc(bool force_preprocessing=false)
00755         {
00756             SG_INFO( "force: %d\n", force_preprocessing);
00757 
00758             if ( sparse_feature_matrix && get_num_preproc() )
00759             {
00760                 for (int32_t i=0; i<get_num_preproc(); i++)
00761                 {
00762                     if ( (!is_preprocessed(i) || force_preprocessing) )
00763                     {
00764                         set_preprocessed(i);
00765                         SG_INFO( "preprocessing using preproc %s\n", get_preproc(i)->get_name());
00766                         if (((CSparsePreProc<ST>*) get_preproc(i))->apply_to_sparse_feature_matrix(this) == NULL)
00767                             return false;
00768                     }
00769                     return true;
00770                 }
00771                 return true;
00772             }
00773             else
00774             {
00775                 SG_WARNING( "no sparse feature matrix available or features already preprocessed - skipping.\n");
00776                 return false;
00777             }
00778         }
00779 
        /** @return size in bytes of a single feature value of type ST */
00784         virtual int32_t get_size() { return sizeof(ST); }
00785 
        /** Build this sparse object from a dense CSimpleFeatures object.
         *
         * NOTE(review): fm is not freed here — presumably
         * get_feature_matrix returns an internal pointer owned by sf;
         * confirm against CSimpleFeatures.
         *
         * @param sf dense features to convert from
         * @return true on success (see set_full_feature_matrix)
         */
00791         bool obtain_from_simple(CSimpleFeatures<ST>* sf)
00792         {
00793             int32_t num_feat=0;
00794             int32_t num_vec=0;
00795             ST* fm=sf->get_feature_matrix(num_feat, num_vec);
00796             ASSERT(fm && num_feat>0 && num_vec>0);
00797 
00798             return set_full_feature_matrix(fm, num_feat, num_vec);
00799         }
00800 
        /** @return number of feature vectors */
00805         virtual inline int32_t  get_num_vectors() { return num_vectors; }
00806 
        /** @return number of features (dense dimensionality) */
00811         inline int32_t  get_num_features() { return num_features; }
00812 
        /** Raise the number of features; the new value must not be smaller
         * than the current one (the stored entries stay valid).
         *
         * @param num new number of features (>= current num_features)
         * @return the new number of features
         */
00824         inline int32_t set_num_features(int32_t num)
00825         {
00826             int32_t n=num_features;
00827             ASSERT(n<=num);
00828             num_features=num;
00829             return num_features;
00830         }
00831 
        /** @return feature class C_SPARSE */
00836         inline virtual EFeatureClass get_feature_class() { return C_SPARSE; }
00837 
        /** @return feature type, defined per ST specialization elsewhere */
00842         inline virtual EFeatureType get_feature_type();
00843 
        /** Release a feature vector: unlock its cache entry (if a cache
         * exists) and delete the array when the caller owned it.
         *
         * NOTE(review): byte-for-byte duplicate of
         * free_sparse_feature_vector() above — candidates for merging.
         *
         * @param feat_vec vector to release
         * @param num index the vector was fetched with
         * @param free whether the caller owns (and we must delete) the array
         */
00850         void free_feature_vector(TSparseEntry<ST>* feat_vec, int32_t num, bool free)
00851         {
00852             if (feature_cache)
00853                 feature_cache->unlock_entry(num);
00854 
00855             if (free)
00856                 delete[] feat_vec ;
00857         }
00858 
00863         int64_t get_num_nonzero_entries()
00864         {
00865             int64_t num=0;
00866             for (int32_t i=0; i<num_vectors; i++)
00867                 num+=sparse_feature_matrix[i].num_feat_entries;
00868 
00869             return num;
00870         }
00871 
        /** Fill sq[i] with the squared L2 norm of each feature vector.
         *
         * NOTE(review): `this` is hard-cast to the float64_t
         * specialization, so this only makes sense when ST==float64_t —
         * confirm callers never use it on other specializations.
         *
         * @param sq caller-provided array of get_num_vectors() doubles
         * @return sq (for chaining)
         */
00877         float64_t* compute_squared(float64_t* sq)
00878         {
00879             ASSERT(sq);
00880 
00881             int32_t len=0;
00882             bool do_free=false;
00883 
00884             for (int32_t i=0; i<this->get_num_vectors(); i++)
00885             {
00886                 sq[i]=0;
00887                 TSparseEntry<float64_t>* vec = ((CSparseFeatures<float64_t>*) this)->get_sparse_feature_vector(i, len, do_free);
00888 
                // squared norm = sum of squared non-zero entries
00889                 for (int32_t j=0; j<len; j++)
00890                     sq[i] += vec[j].entry * vec[j].entry;
00891 
00892                 ((CSparseFeatures<float64_t>*) this)->free_feature_vector(vec, i, do_free);
00893             }
00894 
00895             return sq;
00896         }
00897 
        /** Compute ||a-b||^2 between vector idx_a of lhs and idx_b of rhs
         * using precomputed squared norms: ||a||^2 + ||b||^2 - 2 a.b,
         * where the dot product is evaluated by a merge-join over the two
         * sorted sparse vectors (shorter vector drives the loop).
         *
         * @param lhs features of the left-hand vector
         * @param sq_lhs precomputed squared norms of lhs (see compute_squared)
         * @param idx_a index into lhs
         * @param rhs features of the right-hand vector
         * @param sq_rhs precomputed squared norms of rhs
         * @param idx_b index into rhs
         * @return |  ||a-b||^2  | (absolute value guards against tiny
         *         negative results from floating point cancellation)
         */
00910         float64_t compute_squared_norm(CSparseFeatures<float64_t>* lhs, float64_t* sq_lhs, int32_t idx_a, CSparseFeatures<float64_t>* rhs, float64_t* sq_rhs, int32_t idx_b)
00911         {
00912             int32_t i,j;
00913             int32_t alen, blen;
00914             bool afree, bfree;
00915             ASSERT(lhs);
00916             ASSERT(rhs);
00917 
00918             TSparseEntry<float64_t>* avec=lhs->get_sparse_feature_vector(idx_a, alen, afree);
00919             TSparseEntry<float64_t>* bvec=rhs->get_sparse_feature_vector(idx_b, blen, bfree);
00920             ASSERT(avec);
00921             ASSERT(bvec);
00922 
            // start from ||a||^2 + ||b||^2, then subtract 2 a.b below
00923             float64_t result=sq_lhs[idx_a]+sq_rhs[idx_b];
00924 
00925             if (alen<=blen)
00926             {
00927                 j=0;
00928                 for (i=0; i<alen; i++)
00929                 {
00930                     int32_t a_feat_idx=avec[i].feat_index;
00931 
00932                     while ((j<blen) && (bvec[j].feat_index < a_feat_idx))
00933                         j++;
00934 
00935                     if ((j<blen) && (bvec[j].feat_index == a_feat_idx))
00936                     {
00937                         result-=2*(avec[i].entry*bvec[j].entry);
00938                         j++;
00939                     }
00940                 }
00941             }
00942             else
00943             {
                // mirror image of the branch above with the roles swapped
00944                 j=0;
00945                 for (i=0; i<blen; i++)
00946                 {
00947                     int32_t b_feat_idx=bvec[i].feat_index;
00948 
00949                     while ((j<alen) && (avec[j].feat_index<b_feat_idx))
00950                         j++;
00951 
00952                     if ((j<alen) && (avec[j].feat_index == b_feat_idx))
00953                     {
00954                         result-=2*(bvec[i].entry*avec[j].entry);
00955                         j++;
00956                     }
00957                 }
00958             }
00959 
00960             ((CSparseFeatures<float64_t>*) lhs)->free_feature_vector(avec, idx_a, afree);
00961             ((CSparseFeatures<float64_t>*) rhs)->free_feature_vector(bvec, idx_b, bfree);
00962 
00963             return CMath::abs(result);
00964         }
00965 
00971         CLabels* load_svmlight_file(char* fname)
00972         {
00973             CLabels* lab=NULL;
00974 
00975             size_t blocksize=1024*1024;
00976             size_t required_blocksize=blocksize;
00977             uint8_t* dummy=new uint8_t[blocksize];
00978             FILE* f=fopen(fname, "ro");
00979 
00980             if (f)
00981             {
00982                 free_sparse_feature_matrix();
00983                 num_vectors=0;
00984                 num_features=0;
00985 
00986                 SG_INFO("counting line numbers in file %s\n", fname);
00987                 size_t sz=blocksize;
00988                 size_t block_offs=0;
00989                 size_t old_block_offs=0;
00990                 fseek(f, 0, SEEK_END);
00991                 size_t fsize=ftell(f);
00992                 rewind(f);
00993 
00994                 while (sz == blocksize)
00995                 {
00996                     sz=fread(dummy, sizeof(uint8_t), blocksize, f);
00997                     bool contains_cr=false;
00998                     for (size_t i=0; i<sz; i++)
00999                     {
01000                         block_offs++;
01001                         if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
01002                         {
01003                             num_vectors++;
01004                             contains_cr=true;
01005                             required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs+1);
01006                             old_block_offs=block_offs;
01007                         }
01008                     }
01009                     SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");
01010                 }
01011 
01012                 SG_INFO("found %d feature vectors\n", num_vectors);
01013                 delete[] dummy;
01014                 blocksize=required_blocksize;
01015                 dummy = new uint8_t[blocksize+1]; //allow setting of '\0' at EOL
01016 
01017                 lab=new CLabels(num_vectors);
01018                 sparse_feature_matrix=new TSparse<ST>[num_vectors];
01019 
01020                 rewind(f);
01021                 sz=blocksize;
01022                 int32_t lines=0;
01023                 while (sz == blocksize)
01024                 {
01025                     sz=fread(dummy, sizeof(uint8_t), blocksize, f);
01026 
01027                     size_t old_sz=0;
01028                     for (size_t i=0; i<sz; i++)
01029                     {
01030                         if (i==sz-1 && dummy[i]!='\n' && sz==blocksize)
01031                         {
01032                             size_t len=i-old_sz+1;
01033                             uint8_t* data=&dummy[old_sz];
01034 
01035                             for (int32_t j=0; j<len; j++)
01036                                 dummy[j]=data[j];
01037 
01038                             sz=fread(dummy+len, sizeof(uint8_t), blocksize-len, f);
01039                             i=0;
01040                             old_sz=0;
01041                             sz+=len;
01042                         }
01043 
01044                         if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
01045                         {
01046 
01047                             size_t len=i-old_sz;
01048                             uint8_t* data=&dummy[old_sz];
01049 
01050                             int32_t dims=0;
01051                             for (int32_t j=0; j<len; j++)
01052                             {
01053                                 if (data[j]==':')
01054                                     dims++;
01055                             }
01056 
01057                             if (dims<=0)
01058                             {
01059                                 SG_ERROR("Error in line %d - number of"
01060                                         " dimensions is %d line is %d characters"
01061                                         " long\n line_content:'%.*s'\n", lines,
01062                                         dims, len, len, (const char*) data);
01063                             }
01064 
01065                             TSparseEntry<ST>* feat=new TSparseEntry<ST>[dims];
01066                             int32_t j=0;
01067                             for (; j<len; j++)
01068                             {
01069                                 if (data[j]==' ')
01070                                 {
01071                                     data[j]='\0';
01072 
01073                                     lab->set_label(lines, atof((const char*) data));
01074                                     break;
01075                                 }
01076                             }
01077 
01078                             int32_t d=0;
01079                             j++;
01080                             uint8_t* start=&data[j];
01081                             for (; j<len; j++)
01082                             {
01083                                 if (data[j]==':')
01084                                 {
01085                                     data[j]='\0';
01086 
01087                                     feat[d].feat_index=(int32_t) atoi((const char*) start)-1;
01088                                     num_features=CMath::max(num_features, feat[d].feat_index+1);
01089 
01090                                     j++;
01091                                     start=&data[j];
01092                                     for (; j<len; j++)
01093                                     {
01094                                         if (data[j]==' ' || data[j]=='\n')
01095                                         {
01096                                             data[j]='\0';
01097                                             feat[d].entry=(ST) atof((const char*) start);
01098                                             d++;
01099                                             break;
01100                                         }
01101                                     }
01102 
01103                                     if (j==len)
01104                                     {
01105                                         data[j]='\0';
01106                                         feat[dims-1].entry=(ST) atof((const char*) start);
01107                                     }
01108 
01109                                     j++;
01110                                     start=&data[j];
01111                                 }
01112                             }
01113 
01114                             sparse_feature_matrix[lines].vec_index=lines;
01115                             sparse_feature_matrix[lines].num_feat_entries=dims;
01116                             sparse_feature_matrix[lines].features=feat;
01117 
01118                             old_sz=i+1;
01119                             lines++;
01120                             SG_PROGRESS(lines, 0, num_vectors, 1, "LOADING:\t");
01121                         }
01122                     }
01123                 }
01124                 SG_INFO("file successfully read\n");
01125                 fclose(f);
01126             }
01127 
01128             delete[] dummy;
01129 
01130             return lab;
01131         }
01132 
01139         bool write_svmlight_file(char* fname, CLabels* label)
01140         {
01141             ASSERT(label);
01142             int32_t num=label->get_num_labels();
01143             ASSERT(num>0);
01144             ASSERT(num==num_vectors);
01145 
01146             FILE* f=fopen(fname, "wb");
01147 
01148             if (f)
01149             {
01150                 for (int32_t i=0; i<num; i++)
01151                 {
01152                     fprintf(f, "%d ", (int32_t) label->get_int_label(i));
01153 
01154                     TSparseEntry<ST>* vec = sparse_feature_matrix[i].features;
01155                     int32_t num_feat = sparse_feature_matrix[i].num_feat_entries;
01156 
01157                     for (int32_t j=0; j<num_feat; j++)
01158                     {
01159                         if (j<num_feat-1)
01160                             fprintf(f, "%d:%f ", (int32_t) vec[j].feat_index+1, (double) vec[j].entry);
01161                         else
01162                             fprintf(f, "%d:%f\n", (int32_t) vec[j].feat_index+1, (double) vec[j].entry);
01163                     }
01164                 }
01165 
01166                 fclose(f);
01167                 return true;
01168             }
01169             return false;
01170         }
01171 
01179         virtual int32_t get_dim_feature_space()
01180         {
01181             return num_features;
01182         }
01183 
01190         virtual float64_t dot(int32_t vec_idx1, int32_t vec_idx2)
01191         {
01192             bool afree, bfree;
01193             int32_t alen, blen;
01194             TSparseEntry<ST>* avec=get_sparse_feature_vector(vec_idx1, alen, afree);
01195             TSparseEntry<ST>* bvec=get_sparse_feature_vector(vec_idx2, blen, bfree);
01196 
01197             float64_t result=sparse_dot(1, avec, alen, bvec, blen);
01198 
01199             free_sparse_feature_vector(avec, vec_idx1, afree);
01200             free_sparse_feature_vector(bvec, vec_idx2, bfree);
01201 
01202             return result;
01203         }
01204 
01211         virtual float64_t dense_dot(int32_t vec_idx1, const float64_t* vec2, int32_t vec2_len)
01212         {
01213             ASSERT(vec2);
01214             ASSERT(vec2_len==num_features);
01215             float64_t result=0;
01216 
01217             bool vfree;
01218             int32_t vlen;
01219             TSparseEntry<ST>* sv=get_sparse_feature_vector(vec_idx1, vlen, vfree);
01220 
01221             if (sv)
01222             {
01223                 for (int32_t i=0; i<vlen; i++)
01224                     result+=vec2[sv[i].feat_index]*sv[i].entry;
01225             }
01226 
01227             free_sparse_feature_vector(sv, vec_idx1, vfree);
01228 
01229             return result;
01230         }
01231 
01233         inline virtual const char* get_name() const { return "SparseFeatures"; }
01234 
01235     protected:
01246         virtual TSparseEntry<ST>* compute_sparse_feature_vector(int32_t num, int32_t& len, TSparseEntry<ST>* target=NULL)
01247         {
01248             len=0;
01249             return NULL;
01250         }
01251 
    protected:

        /// total number of sparse vectors in the matrix
        int32_t num_vectors;

        /// total number of features (dimensionality of the feature space)
        int32_t num_features;

        /// array of num_vectors sparse vectors
        TSparse<ST>* sparse_feature_matrix;

        /// cache for on-the-fly computed feature vectors (may be NULL)
        CCache< TSparseEntry<ST> >* feature_cache;
01265 };
01266 
/** get feature type the bool features can deal with
 * @return feature type BOOL
 */
template<> inline EFeatureType CSparseFeatures<bool>::get_feature_type()
{
    return F_BOOL;
}

/** get feature type the char features can deal with
 * @return feature type CHAR
 */
template<> inline EFeatureType CSparseFeatures<char>::get_feature_type()
{
    return F_CHAR;
}

/** get feature type the uint8_t features can deal with
 * @return feature type BYTE
 */
template<> inline EFeatureType CSparseFeatures<uint8_t>::get_feature_type()
{
    return F_BYTE;
}

/** get feature type the int16_t features can deal with
 * @return feature type SHORT
 */
template<> inline EFeatureType CSparseFeatures<int16_t>::get_feature_type()
{
    return F_SHORT;
}

/** get feature type the uint16_t features can deal with
 * @return feature type WORD
 */
template<> inline EFeatureType CSparseFeatures<uint16_t>::get_feature_type()
{
    return F_WORD;
}

/** get feature type the int32_t features can deal with
 * @return feature type INT
 */
template<> inline EFeatureType CSparseFeatures<int32_t>::get_feature_type()
{
    return F_INT;
}

/** get feature type the uint32_t features can deal with
 * @return feature type UINT
 */
template<> inline EFeatureType CSparseFeatures<uint32_t>::get_feature_type()
{
    return F_UINT;
}

/** get feature type the int64_t features can deal with
 * @return feature type LONG
 */
template<> inline EFeatureType CSparseFeatures<int64_t>::get_feature_type()
{
    return F_LONG;
}

/** get feature type the uint64_t features can deal with
 * @return feature type ULONG
 */
template<> inline EFeatureType CSparseFeatures<uint64_t>::get_feature_type()
{
    return F_ULONG;
}

/** get feature type the float32_t features can deal with
 * @return feature type SHORTREAL
 */
template<> inline EFeatureType CSparseFeatures<float32_t>::get_feature_type()
{
    return F_SHORTREAL;
}

/** get feature type the float64_t features can deal with
 * @return feature type DREAL
 */
template<> inline EFeatureType CSparseFeatures<float64_t>::get_feature_type()
{
    return F_DREAL;
}

/** get feature type the floatmax_t features can deal with
 * @return feature type LONGREAL
 */
template<> inline EFeatureType CSparseFeatures<floatmax_t>::get_feature_type()
{
    return F_LONGREAL;
}
01374 #endif /* _SPARSEFEATURES__H__ */

SHOGUN Machine Learning Toolbox - Documentation