00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013 #ifndef _SPARSEFEATURES__H__
00014 #define _SPARSEFEATURES__H__
00015
00016 #include <string.h>
00017 #include <stdlib.h>
00018
00019 #include "lib/common.h"
00020 #include "lib/Mathematics.h"
00021 #include "lib/Cache.h"
00022 #include "lib/io.h"
00023 #include "lib/Cache.h"
00024 #include "lib/File.h"
00025
00026 #include "features/Labels.h"
00027 #include "features/Features.h"
00028 #include "features/DotFeatures.h"
00029 #include "features/SimpleFeatures.h"
00030 #include "preproc/SparsePreProc.h"
00031
00032 namespace shogun
00033 {
00034
00035 class CFile;
00036 class CLabels;
00037 class CFeatures;
00038 class CDotFeatures;
00039 template <class ST> class CSimpleFeatures;
00040 template <class ST> class CSparsePreProc;
00041
00042 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00043
#ifndef DOXYGEN_SHOULD_SKIP_THIS
/** template struct TSparseEntry: one non-zero element of a sparse vector */
template <class ST> struct TSparseEntry
{
	/** dimension (column) this entry belongs to */
	int32_t feat_index;
	/** value stored at that dimension */
	ST entry;
};

/** template struct TSparse: one sparse feature vector (a list of non-zero entries) */
template <class ST> struct TSparse
{
public:
	/** index of this vector within the feature matrix */
	int32_t vec_index;
	/** number of valid elements in the features array */
	int32_t num_feat_entries;
	/** array of non-zero entries; owned by the containing matrix */
	TSparseEntry<ST>* features;
};
#endif // DOXYGEN_SHOULD_SKIP_THIS
00063 #endif // DOXYGEN_SHOULD_SKIP_THIS
00064
00077 template <class ST> class CSparseFeatures : public CDotFeatures
00078 {
00079 public:
/** default constructor
 *
 * @param size cache size in MB, forwarded to CDotFeatures; no feature
 *             matrix is allocated yet
 */
CSparseFeatures(int32_t size=0)
: CDotFeatures(size), num_vectors(0), num_features(0),
	sparse_feature_matrix(NULL), feature_cache(NULL)
{}
00088
00097 CSparseFeatures(TSparse<ST>* src, int32_t num_feat, int32_t num_vec, bool copy=false)
00098 : CDotFeatures(0), num_vectors(0), num_features(0),
00099 sparse_feature_matrix(NULL), feature_cache(NULL)
00100 {
00101 if (!copy)
00102 set_sparse_feature_matrix(src, num_feat, num_vec);
00103 else
00104 {
00105 sparse_feature_matrix = new TSparse<ST>[num_vec];
00106 memcpy(sparse_feature_matrix, src, sizeof(TSparse<ST>)*num_vec);
00107 for (int32_t i=0; i< num_vec; i++)
00108 {
00109 sparse_feature_matrix[i].features = new TSparseEntry<ST>[sparse_feature_matrix[i].num_feat_entries];
00110 memcpy(sparse_feature_matrix[i].features, src[i].features, sizeof(TSparseEntry<ST>)*sparse_feature_matrix[i].num_feat_entries);
00111
00112 }
00113 }
00114 }
00115
/** constructor from a dense matrix; converts it to sparse form
 *
 * @param src dense matrix, num_feat rows per vector, num_vec vectors
 *            (layout as expected by set_full_feature_matrix)
 * @param num_feat number of feature dimensions
 * @param num_vec number of vectors
 */
CSparseFeatures(ST* src, int32_t num_feat, int32_t num_vec)
: CDotFeatures(0), num_vectors(0), num_features(0),
	sparse_feature_matrix(NULL), feature_cache(NULL)
{
	set_full_feature_matrix(src, num_feat, num_vec);
}
00129
00131 CSparseFeatures(const CSparseFeatures & orig)
00132 : CDotFeatures(orig), num_vectors(orig.num_vectors),
00133 num_features(orig.num_features),
00134 sparse_feature_matrix(orig.sparse_feature_matrix),
00135 feature_cache(orig.feature_cache)
00136 {
00137 if (orig.sparse_feature_matrix)
00138 {
00139 free_sparse_feature_matrix();
00140 sparse_feature_matrix=new TSparse<ST>[num_vectors];
00141 memcpy(sparse_feature_matrix, orig.sparse_feature_matrix, sizeof(TSparse<ST>)*num_vectors);
00142 for (int32_t i=0; i< num_vectors; i++)
00143 {
00144 sparse_feature_matrix[i].features=new TSparseEntry<ST>[sparse_feature_matrix[i].num_feat_entries];
00145 memcpy(sparse_feature_matrix[i].features, orig.sparse_feature_matrix[i].features, sizeof(TSparseEntry<ST>)*sparse_feature_matrix[i].num_feat_entries);
00146
00147 }
00148 }
00149 }
00150
/** constructor loading features from a file
 *
 * @param loader file to load the sparse matrix from
 */
CSparseFeatures(CFile* loader)
: CDotFeatures(loader), num_vectors(0), num_features(0),
	sparse_feature_matrix(NULL), feature_cache(NULL)
{
	load(loader);
}

/** destructor — releases the sparse matrix and the feature cache */
virtual ~CSparseFeatures()
{
	free_sparse_features();
}

/** free the in-memory sparse matrix (all entry arrays plus the header
 * array) and reset the vector/feature counts to zero */
void free_sparse_feature_matrix()
{
	clean_tsparse(sparse_feature_matrix, num_vectors);
	sparse_feature_matrix = NULL;
	num_vectors=0;
	num_features=0;
}

/** free the sparse matrix AND the feature cache */
void free_sparse_features()
{
	free_sparse_feature_matrix();
	delete feature_cache;
	feature_cache = NULL;
}

/** duplicate this object via the (deep-copying) copy constructor
 *
 * @return newly allocated copy; caller takes ownership
 */
virtual CFeatures* duplicate() const
{
	return new CSparseFeatures<ST>(*this);
}
00197
00205 ST get_feature(int32_t num, int32_t index)
00206 {
00207 ASSERT(index>=0 && index<num_features) ;
00208 ASSERT(num>=0 && num<num_vectors) ;
00209
00210 bool vfree;
00211 int32_t num_feat;
00212 int32_t i;
00213 TSparseEntry<ST>* sv=get_sparse_feature_vector(num, num_feat, vfree);
00214 ST ret = 0 ;
00215
00216 if (sv)
00217 {
00218 for (i=0; i<num_feat; i++)
00219 if (sv[i].feat_index==index)
00220 ret += sv[i].entry ;
00221 }
00222
00223 free_sparse_feature_vector(sv, num, vfree);
00224
00225 return ret ;
00226 }
00227
00228
00237 ST* get_full_feature_vector(int32_t num, int32_t& len)
00238 {
00239 bool vfree;
00240 int32_t num_feat;
00241 int32_t i;
00242 len=0;
00243 TSparseEntry<ST>* sv=get_sparse_feature_vector(num, num_feat, vfree);
00244 ST* fv=NULL;
00245
00246 if (sv)
00247 {
00248 len=num_features;
00249 fv=new ST[num_features];
00250
00251 for (i=0; i<num_features; i++)
00252 fv[i]=0;
00253
00254 for (i=0; i<num_feat; i++)
00255 fv[sv[i].feat_index]= sv[i].entry;
00256 }
00257
00258 free_sparse_feature_vector(sv, num, vfree);
00259
00260 return fv;
00261 }
00262
/** densify one sparse vector into a malloc'd buffer
 *
 * @param dst receives a malloc-allocated dense array of length
 *            num_features (caller frees with free()); untouched on error
 * @param len receives num_features on success, 0 otherwise
 * @param num vector index; SG_ERROR if >= num_vectors
 */
void get_full_feature_vector(ST** dst, int32_t* len, int32_t num)
{
	if (num>=num_vectors)
	{
		SG_ERROR("Index out of bounds (number of vectors %d, you "
				"requested %d)\n", num_vectors, num);
	}

	bool vfree;
	int32_t num_feat=0;
	*len=0;
	TSparseEntry<ST>* sv=get_sparse_feature_vector(num, num_feat, vfree);

	if (sv)
	{
		*len=num_features;
		// malloc (not new[]) so external callers can release with free()
		*dst= (ST*) malloc(sizeof(ST)*num_features);
		memset(*dst, 0, sizeof(ST)*num_features);

		for (int32_t i=0; i<num_feat; i++)
			(*dst)[sv[i].feat_index]= sv[i].entry;
	}

	free_sparse_feature_vector(sv, num, vfree);
}
00294
00295
00301 virtual inline int32_t get_nnz_features_for_vector(int32_t num)
00302 {
00303 bool vfree;
00304 int32_t len;
00305 TSparseEntry<ST>* sv = get_sparse_feature_vector(num, len, vfree);
00306 free_sparse_feature_vector(sv, num, vfree);
00307 return len;
00308 }
00309
/** obtain (a pointer to) sparse vector `num`
 *
 * If an in-memory matrix exists, a pointer into it is returned (vfree=false).
 * Otherwise the vector is fetched from the cache or computed on the fly via
 * compute_sparse_feature_vector(); vfree=true means the caller owns the
 * returned array and must release it through free_sparse_feature_vector().
 *
 * @param num vector index
 * @param len receives the number of entries
 * @param vfree receives whether the caller must free the result
 * @return pointer to the entries (possibly NULL)
 */
TSparseEntry<ST>* get_sparse_feature_vector(int32_t num, int32_t& len, bool& vfree)
{
	ASSERT(num<num_vectors);

	if (sparse_feature_matrix)
	{
		len= sparse_feature_matrix[num].num_feat_entries;
		vfree=false ;
		return sparse_feature_matrix[num].features;
	}
	else
	{
		TSparseEntry<ST>* feat=NULL;
		vfree=false;

		if (feature_cache)
		{
			feat=feature_cache->lock_entry(num);

			if (feat)
				return feat;
			else
			{
				feat=feature_cache->set_entry(num);
			}
		}

		// no cache slot available: caller must free the computed vector
		if (!feat)
			vfree=true;

		feat=compute_sparse_feature_vector(num, len, feat);


		if (get_num_preproc())
		{
			int32_t tmp_len=len;
			TSparseEntry<ST>* tmp_feat_before = feat;
			TSparseEntry<ST>* tmp_feat_after = NULL;

			for (int32_t i=0; i<get_num_preproc(); i++)
			{
				// NOTE(review): the call applying preproc i to
				// tmp_feat_before appears to have been removed or
				// commented out here; tmp_feat_after is never assigned,
				// so with registered preprocessors the memcpy below
				// reads from NULL — confirm against the upstream source.
				if (i!=0)
					delete[] tmp_feat_before;
				tmp_feat_before=tmp_feat_after;
			}

			memcpy(feat, tmp_feat_after, sizeof(TSparseEntry<ST>)*tmp_len);
			delete[] tmp_feat_after;
			len=tmp_len ;
			SG_DEBUG( "len: %d len2: %d\n", len, num_features);
		}
		return feat ;
	}
}
00376
00377
00388 static ST sparse_dot(ST alpha, TSparseEntry<ST>* avec, int32_t alen, TSparseEntry<ST>* bvec, int32_t blen)
00389 {
00390 ST result=0;
00391
00392
00393 if (avec && bvec)
00394 {
00395 if (alen<=blen)
00396 {
00397 int32_t j=0;
00398 for (int32_t i=0; i<alen; i++)
00399 {
00400 int32_t a_feat_idx=avec[i].feat_index;
00401
00402 while ( (j<blen) && (bvec[j].feat_index < a_feat_idx) )
00403 j++;
00404
00405 if ( (j<blen) && (bvec[j].feat_index == a_feat_idx) )
00406 {
00407 result+= avec[i].entry * bvec[j].entry;
00408 j++;
00409 }
00410 }
00411 }
00412 else
00413 {
00414 int32_t j=0;
00415 for (int32_t i=0; i<blen; i++)
00416 {
00417 int32_t b_feat_idx=bvec[i].feat_index;
00418
00419 while ( (j<alen) && (avec[j].feat_index < b_feat_idx) )
00420 j++;
00421
00422 if ( (j<alen) && (avec[j].feat_index == b_feat_idx) )
00423 {
00424 result+= bvec[i].entry * avec[j].entry;
00425 j++;
00426 }
00427 }
00428 }
00429
00430 result*=alpha;
00431 }
00432
00433 return result;
00434 }
00435
00446 ST dense_dot(ST alpha, int32_t num, ST* vec, int32_t dim, ST b)
00447 {
00448 ASSERT(vec);
00449 ASSERT(dim==num_features);
00450 ST result=b;
00451
00452 bool vfree;
00453 int32_t num_feat;
00454 TSparseEntry<ST>* sv=get_sparse_feature_vector(num, num_feat, vfree);
00455
00456 if (sv)
00457 {
00458 for (int32_t i=0; i<num_feat; i++)
00459 result+=alpha*vec[sv[i].feat_index]*sv[i].entry;
00460 }
00461
00462 free_sparse_feature_vector(sv, num, vfree);
00463 return result;
00464 }
00465
00475 void add_to_dense_vec(float64_t alpha, int32_t num, float64_t* vec, int32_t dim, bool abs_val=false)
00476 {
00477 ASSERT(vec);
00478 if (dim!=num_features)
00479 {
00480 SG_ERROR("dimension of vec (=%d) does not match number of features (=%d)\n",
00481 dim, num_features);
00482 }
00483
00484 bool vfree;
00485 int32_t num_feat;
00486 TSparseEntry<ST>* sv=get_sparse_feature_vector(num, num_feat, vfree);
00487
00488 if (sv)
00489 {
00490 if (abs_val)
00491 {
00492 for (int32_t i=0; i<num_feat; i++)
00493 vec[sv[i].feat_index]+= alpha*CMath::abs(sv[i].entry);
00494 }
00495 else
00496 {
00497 for (int32_t i=0; i<num_feat; i++)
00498 vec[sv[i].feat_index]+= alpha*sv[i].entry;
00499 }
00500 }
00501
00502 free_sparse_feature_vector(sv, num, vfree);
00503 }
00504
/** release a vector obtained from get_sparse_feature_vector
 *
 * @param feat_vec the pointer previously returned
 * @param num the vector index it was obtained for (unlocks the cache entry)
 * @param free the vfree flag returned alongside the vector
 */
void free_sparse_feature_vector(TSparseEntry<ST>* feat_vec, int32_t num, bool free)
{
	if (feature_cache)
		feature_cache->unlock_entry(num);

	if (free)
		delete[] feat_vec ;
}

/** get the in-memory sparse matrix (no copy; class retains ownership)
 *
 * @param num_feat receives number of feature dimensions
 * @param num_vec receives number of vectors
 * @return pointer to the internal matrix (may be NULL)
 */
TSparse<ST>* get_sparse_feature_matrix(int32_t &num_feat, int32_t &num_vec)
{
	num_feat=num_features;
	num_vec=num_vectors;

	return sparse_feature_matrix;
}

/** get the in-memory sparse matrix plus its non-zero count
 * (pointer-out variant; class retains ownership of *dst)
 *
 * @param dst receives the internal matrix pointer
 * @param num_feat receives number of feature dimensions
 * @param num_vec receives number of vectors
 * @param nnz receives total number of stored entries
 */
void get_sparse_feature_matrix(TSparse<ST>** dst, int32_t* num_feat,
		int32_t* num_vec, int64_t* nnz)
{
	*nnz=get_num_nonzero_entries();
	*num_feat=num_features;
	*num_vec=num_vectors;
	*dst=sparse_feature_matrix;
}
00551
00557 void clean_tsparse(TSparse<ST>* sfm, int32_t num_vec)
00558 {
00559 if (sfm)
00560 {
00561 for (int32_t i=0; i<num_vec; i++)
00562 delete[] sfm[i].features;
00563
00564 delete[] sfm;
00565 }
00566 }
00567
/** build a new CSparseFeatures object holding the transposed matrix
 *
 * @return newly allocated transposed feature object (caller owns it)
 */
CSparseFeatures<ST>* get_transposed()
{
	int32_t num_feat;
	int32_t num_vec;
	TSparse<ST>* s=get_transposed(num_feat, num_vec);
	return new CSparseFeatures<ST>(s, num_feat, num_vec);
}

/** compute the transposed sparse matrix
 *
 * Two passes: first a histogram of entries per feature dimension sizes
 * the output rows, then the entries are scattered into place.
 *
 * @param num_feat receives the transposed feature count (== num_vectors)
 * @param num_vec receives the transposed vector count (== num_features)
 * @return newly allocated transposed matrix (caller owns it)
 */
TSparse<ST>* get_transposed(int32_t &num_feat, int32_t &num_vec)
{
	num_feat=num_vectors;
	num_vec=num_features;

	// pass 1: count how many entries each feature dimension carries
	int32_t* hist=new int32_t[num_features];
	memset(hist, 0, sizeof(int32_t)*num_features);


	for (int32_t v=0; v<num_vectors; v++)
	{
		int32_t vlen;
		bool vfree;
		TSparseEntry<ST>* sv=get_sparse_feature_vector(v, vlen, vfree);

		for (int32_t i=0; i<vlen; i++)
			hist[sv[i].feat_index]++;

		free_sparse_feature_vector(sv, v, vfree);
	}


	// allocate the transposed rows sized by the histogram
	TSparse<ST>* sfm=new TSparse<ST>[num_vec];
	for (int32_t v=0; v<num_vec; v++)
	{
		sfm[v].features= new TSparseEntry<ST>[hist[v]];
		sfm[v].num_feat_entries=hist[v];
		sfm[v].vec_index=v;
	}


	// pass 2: scatter entries; hist is reused as per-row fill cursor
	memset(hist,0,sizeof(int32_t)*num_features);
	for (int32_t v=0; v<num_vectors; v++)
	{
		int32_t vlen;
		bool vfree;
		TSparseEntry<ST>* sv=get_sparse_feature_vector(v, vlen, vfree);

		for (int32_t i=0; i<vlen; i++)
		{
			int32_t vidx=sv[i].feat_index;
			int32_t fidx=v;
			sfm[vidx].features[hist[vidx]].feat_index=fidx;
			sfm[vidx].features[hist[vidx]].entry=sv[i].entry;
			hist[vidx]++;
		}

		free_sparse_feature_vector(sv, v, vfree);
	}

	delete[] hist;
	return sfm;
}
00642
/** adopt a sparse matrix (takes ownership of src; any previously held
 * matrix is freed first)
 *
 * @param src matrix to adopt
 * @param num_feat number of feature dimensions
 * @param num_vec number of vectors
 */
virtual void set_sparse_feature_matrix(TSparse<ST>* src, int32_t num_feat, int32_t num_vec)
{
	free_sparse_feature_matrix();

	sparse_feature_matrix=src;
	num_features=num_feat;
	num_vectors=num_vec;
}
00660
00668 ST* get_full_feature_matrix(int32_t &num_feat, int32_t &num_vec)
00669 {
00670 SG_INFO( "converting sparse features to full feature matrix of %ld x %ld entries\n", num_vectors, num_features);
00671 num_feat=num_features;
00672 num_vec=num_vectors;
00673
00674 ST* fm=new ST[num_feat*num_vec];
00675
00676 if (fm)
00677 {
00678 for (int64_t i=0; i<num_feat*num_vec; i++)
00679 fm[i]=0;
00680
00681 for (int32_t v=0; v<num_vec; v++)
00682 {
00683 for (int32_t f=0; f<sparse_feature_matrix[v].num_feat_entries; f++)
00684 {
00685 int64_t offs= (sparse_feature_matrix[v].vec_index * num_feat) + sparse_feature_matrix[v].features[f].feat_index;
00686 fm[offs]= sparse_feature_matrix[v].features[f].entry;
00687 }
00688 }
00689 }
00690 else
00691 SG_ERROR( "error allocating memory for dense feature matrix\n");
00692
00693 return fm;
00694 }
00695
00703 void get_full_feature_matrix(ST** dst, int32_t* num_feat, int32_t* num_vec)
00704 {
00705 SG_INFO( "converting sparse features to full feature matrix of %ld x %ld entries\n", num_vectors, num_features);
00706 *num_feat=num_features;
00707 *num_vec=num_vectors;
00708
00709 *dst= (ST*) malloc(sizeof(ST)*num_features*num_vectors);
00710
00711 if (*dst)
00712 {
00713 for (int64_t i=0; i<num_features*num_vectors; i++)
00714 (*dst)[i]=0;
00715
00716 for (int32_t v=0; v<num_vectors; v++)
00717 {
00718 for (int32_t f=0; f<sparse_feature_matrix[v].num_feat_entries; f++)
00719 {
00720 int64_t offs= (sparse_feature_matrix[v].vec_index * num_features) + sparse_feature_matrix[v].features[f].feat_index;
00721 (*dst)[offs]= sparse_feature_matrix[v].features[f].entry;
00722 }
00723 }
00724 }
00725 else
00726 SG_ERROR( "error allocating memory for dense feature matrix\n");
00727 }
00728
00738 virtual bool set_full_feature_matrix(ST* src, int32_t num_feat, int32_t num_vec)
00739 {
00740 free_sparse_feature_matrix();
00741 bool result=true;
00742 num_features=num_feat;
00743 num_vectors=num_vec;
00744
00745 SG_INFO("converting dense feature matrix to sparse one\n");
00746 int32_t* num_feat_entries=new int[num_vectors];
00747
00748 if (num_feat_entries)
00749 {
00750 int32_t num_total_entries=0;
00751
00752
00753 for (int32_t i=0; i< num_vec; i++)
00754 {
00755 num_feat_entries[i]=0;
00756 for (int32_t j=0; j< num_feat; j++)
00757 {
00758 if (src[i*((int64_t) num_feat) + j] != 0)
00759 num_feat_entries[i]++;
00760 }
00761 }
00762
00763 if (num_vec>0)
00764 {
00765 sparse_feature_matrix=new TSparse<ST>[num_vec];
00766
00767 if (sparse_feature_matrix)
00768 {
00769 for (int32_t i=0; i< num_vec; i++)
00770 {
00771 sparse_feature_matrix[i].vec_index=i;
00772 sparse_feature_matrix[i].num_feat_entries=0;
00773 sparse_feature_matrix[i].features= NULL;
00774
00775 if (num_feat_entries[i]>0)
00776 {
00777 sparse_feature_matrix[i].features= new TSparseEntry<ST>[num_feat_entries[i]];
00778
00779 if (!sparse_feature_matrix[i].features)
00780 {
00781 SG_INFO( "allocation of features failed\n");
00782 return false;
00783 }
00784
00785 sparse_feature_matrix[i].num_feat_entries=num_feat_entries[i];
00786 int32_t sparse_feat_idx=0;
00787
00788 for (int32_t j=0; j< num_feat; j++)
00789 {
00790 int64_t pos= i*num_feat + j;
00791
00792 if (src[pos] != 0)
00793 {
00794 sparse_feature_matrix[i].features[sparse_feat_idx].entry=src[pos];
00795 sparse_feature_matrix[i].features[sparse_feat_idx].feat_index=j;
00796 sparse_feat_idx++;
00797 num_total_entries++;
00798 }
00799 }
00800 }
00801 }
00802 }
00803 else
00804 {
00805 SG_ERROR( "allocation of sparse feature matrix failed\n");
00806 result=false;
00807 }
00808
00809 SG_INFO( "sparse feature matrix has %ld entries (full matrix had %ld, sparsity %2.2f%%)\n",
00810 num_total_entries, num_feat*num_vec, (100.0*num_total_entries)/(num_feat*num_vec));
00811 }
00812 else
00813 {
00814 SG_ERROR( "huh ? zero size matrix given ?\n");
00815 result=false;
00816 }
00817 }
00818 delete[] num_feat_entries;
00819 return result;
00820 }
00821
00827 virtual bool apply_preproc(bool force_preprocessing=false)
00828 {
00829 SG_INFO( "force: %d\n", force_preprocessing);
00830
00831 if ( sparse_feature_matrix && get_num_preproc() )
00832 {
00833 for (int32_t i=0; i<get_num_preproc(); i++)
00834 {
00835 if ( (!is_preprocessed(i) || force_preprocessing) )
00836 {
00837 set_preprocessed(i);
00838 SG_INFO( "preprocessing using preproc %s\n", get_preproc(i)->get_name());
00839 if (((CSparsePreProc<ST>*) get_preproc(i))->apply_to_sparse_feature_matrix(this) == NULL)
00840 return false;
00841 }
00842 return true;
00843 }
00844 return true;
00845 }
00846 else
00847 {
00848 SG_WARNING( "no sparse feature matrix available or features already preprocessed - skipping.\n");
00849 return false;
00850 }
00851 }
00852
/** @return size of one feature element in bytes */
virtual int32_t get_size() { return sizeof(ST); }

/** convert a dense (simple) feature object into this sparse one
 *
 * @param sf dense features to convert from
 * @return true on success
 */
bool obtain_from_simple(CSimpleFeatures<ST>* sf)
{
	int32_t num_feat=0;
	int32_t num_vec=0;
	ST* fm=sf->get_feature_matrix(num_feat, num_vec);
	ASSERT(fm && num_feat>0 && num_vec>0);

	return set_full_feature_matrix(fm, num_feat, num_vec);
}

/** @return number of feature vectors */
virtual inline int32_t get_num_vectors() { return num_vectors; }

/** @return number of feature dimensions */
inline int32_t get_num_features() { return num_features; }

/** grow the feature-space dimensionality (shrinking is not allowed)
 *
 * @param num new dimensionality; must be >= current num_features
 * @return the new num_features
 */
inline int32_t set_num_features(int32_t num)
{
	int32_t n=num_features;
	ASSERT(n<=num);
	num_features=num;
	return num_features;
}

/** @return feature class C_SPARSE */
inline virtual EFeatureClass get_feature_class() { return C_SPARSE; }

/** @return feature type (specialized per ST at the bottom of this file) */
inline virtual EFeatureType get_feature_type();

/** release a feature vector; identical contract to
 * free_sparse_feature_vector (kept for API compatibility)
 *
 * @param feat_vec pointer previously returned by get_sparse_feature_vector
 * @param num vector index it was obtained for
 * @param free the vfree flag returned alongside the vector
 */
void free_feature_vector(TSparseEntry<ST>* feat_vec, int32_t num, bool free)
{
	if (feature_cache)
		feature_cache->unlock_entry(num);

	if (free)
		delete[] feat_vec ;
}
00931
00936 int64_t get_num_nonzero_entries()
00937 {
00938 int64_t num=0;
00939 for (int32_t i=0; i<num_vectors; i++)
00940 num+=sparse_feature_matrix[i].num_feat_entries;
00941
00942 return num;
00943 }
00944
/** fill sq[i] with the squared L2 norm of vector i
 *
 * NOTE(review): this casts `this` to CSparseFeatures<float64_t>
 * regardless of ST — it is only well-defined when ST is float64_t;
 * confirm that callers restrict themselves to real-valued features.
 *
 * @param sq caller-provided array of length get_num_vectors()
 * @return sq (for chaining)
 */
float64_t* compute_squared(float64_t* sq)
{
	ASSERT(sq);

	int32_t len=0;
	bool do_free=false;

	for (int32_t i=0; i<this->get_num_vectors(); i++)
	{
		sq[i]=0;
		TSparseEntry<float64_t>* vec = ((CSparseFeatures<float64_t>*) this)->get_sparse_feature_vector(i, len, do_free);

		for (int32_t j=0; j<len; j++)
			sq[i] += vec[j].entry * vec[j].entry;

		((CSparseFeatures<float64_t>*) this)->free_feature_vector(vec, i, do_free);
	}

	return sq;
}
00970
/** squared Euclidean distance ||a-b||^2 between one lhs and one rhs
 * vector, using precomputed squared norms:
 * ||a||^2 + ||b||^2 - 2<a,b>
 *
 * Entry lists are assumed sorted by feat_index; the shorter list is
 * scanned while a cursor advances through the longer one (merge walk).
 *
 * @param lhs left-hand feature object
 * @param sq_lhs precomputed squared norms for lhs (see compute_squared)
 * @param idx_a vector index into lhs
 * @param rhs right-hand feature object
 * @param sq_rhs precomputed squared norms for rhs
 * @param idx_b vector index into rhs
 * @return |squared distance| (abs guards against tiny negative rounding)
 */
float64_t compute_squared_norm(CSparseFeatures<float64_t>* lhs, float64_t* sq_lhs, int32_t idx_a, CSparseFeatures<float64_t>* rhs, float64_t* sq_rhs, int32_t idx_b)
{
	int32_t i,j;
	int32_t alen, blen;
	bool afree, bfree;
	ASSERT(lhs);
	ASSERT(rhs);

	TSparseEntry<float64_t>* avec=lhs->get_sparse_feature_vector(idx_a, alen, afree);
	TSparseEntry<float64_t>* bvec=rhs->get_sparse_feature_vector(idx_b, blen, bfree);
	ASSERT(avec);
	ASSERT(bvec);

	float64_t result=sq_lhs[idx_a]+sq_rhs[idx_b];

	if (alen<=blen)
	{
		j=0;
		for (i=0; i<alen; i++)
		{
			int32_t a_feat_idx=avec[i].feat_index;

			while ((j<blen) && (bvec[j].feat_index < a_feat_idx))
				j++;

			if ((j<blen) && (bvec[j].feat_index == a_feat_idx))
			{
				result-=2*(avec[i].entry*bvec[j].entry);
				j++;
			}
		}
	}
	else
	{
		j=0;
		for (i=0; i<blen; i++)
		{
			int32_t b_feat_idx=bvec[i].feat_index;

			while ((j<alen) && (avec[j].feat_index<b_feat_idx))
				j++;

			if ((j<alen) && (avec[j].feat_index == b_feat_idx))
			{
				result-=2*(bvec[i].entry*avec[j].entry);
				j++;
			}
		}
	}

	((CSparseFeatures<float64_t>*) lhs)->free_feature_vector(avec, idx_a, afree);
	((CSparseFeatures<float64_t>*) rhs)->free_feature_vector(bvec, idx_b, bfree);

	return CMath::abs(result);
}
01038
01043 void load(CFile* loader);
01044
01049 void save(CFile* writer);
01050
01058 CLabels* load_svmlight_file(char* fname, bool do_sort_features=true)
01059 {
01060 CLabels* lab=NULL;
01061
01062 size_t blocksize=1024*1024;
01063 size_t required_blocksize=blocksize;
01064 uint8_t* dummy=new uint8_t[blocksize];
01065 FILE* f=fopen(fname, "ro");
01066
01067 if (f)
01068 {
01069 free_sparse_feature_matrix();
01070 num_vectors=0;
01071 num_features=0;
01072
01073 SG_INFO("counting line numbers in file %s\n", fname);
01074 size_t sz=blocksize;
01075 size_t block_offs=0;
01076 size_t old_block_offs=0;
01077 fseek(f, 0, SEEK_END);
01078 size_t fsize=ftell(f);
01079 rewind(f);
01080
01081 while (sz == blocksize)
01082 {
01083 sz=fread(dummy, sizeof(uint8_t), blocksize, f);
01084 bool contains_cr=false;
01085 for (size_t i=0; i<sz; i++)
01086 {
01087 block_offs++;
01088 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
01089 {
01090 num_vectors++;
01091 contains_cr=true;
01092 required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs+1);
01093 old_block_offs=block_offs;
01094 }
01095 }
01096 SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");
01097 }
01098
01099 SG_INFO("found %d feature vectors\n", num_vectors);
01100 delete[] dummy;
01101 blocksize=required_blocksize;
01102 dummy = new uint8_t[blocksize+1];
01103
01104 lab=new CLabels(num_vectors);
01105 sparse_feature_matrix=new TSparse<ST>[num_vectors];
01106
01107 rewind(f);
01108 sz=blocksize;
01109 int32_t lines=0;
01110 while (sz == blocksize)
01111 {
01112 sz=fread(dummy, sizeof(uint8_t), blocksize, f);
01113
01114 size_t old_sz=0;
01115 for (size_t i=0; i<sz; i++)
01116 {
01117 if (i==sz-1 && dummy[i]!='\n' && sz==blocksize)
01118 {
01119 size_t len=i-old_sz+1;
01120 uint8_t* data=&dummy[old_sz];
01121
01122 for (int32_t j=0; j<len; j++)
01123 dummy[j]=data[j];
01124
01125 sz=fread(dummy+len, sizeof(uint8_t), blocksize-len, f);
01126 i=0;
01127 old_sz=0;
01128 sz+=len;
01129 }
01130
01131 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
01132 {
01133
01134 size_t len=i-old_sz;
01135 uint8_t* data=&dummy[old_sz];
01136
01137 int32_t dims=0;
01138 for (int32_t j=0; j<len; j++)
01139 {
01140 if (data[j]==':')
01141 dims++;
01142 }
01143
01144 if (dims<=0)
01145 {
01146 SG_ERROR("Error in line %d - number of"
01147 " dimensions is %d line is %d characters"
01148 " long\n line_content:'%.*s'\n", lines,
01149 dims, len, len, (const char*) data);
01150 }
01151
01152 TSparseEntry<ST>* feat=new TSparseEntry<ST>[dims];
01153 int32_t j=0;
01154 for (; j<len; j++)
01155 {
01156 if (data[j]==' ')
01157 {
01158 data[j]='\0';
01159
01160 lab->set_label(lines, atof((const char*) data));
01161 break;
01162 }
01163 }
01164
01165 int32_t d=0;
01166 j++;
01167 uint8_t* start=&data[j];
01168 for (; j<len; j++)
01169 {
01170 if (data[j]==':')
01171 {
01172 data[j]='\0';
01173
01174 feat[d].feat_index=(int32_t) atoi((const char*) start)-1;
01175 num_features=CMath::max(num_features, feat[d].feat_index+1);
01176
01177 j++;
01178 start=&data[j];
01179 for (; j<len; j++)
01180 {
01181 if (data[j]==' ' || data[j]=='\n')
01182 {
01183 data[j]='\0';
01184 feat[d].entry=(ST) atof((const char*) start);
01185 d++;
01186 break;
01187 }
01188 }
01189
01190 if (j==len)
01191 {
01192 data[j]='\0';
01193 feat[dims-1].entry=(ST) atof((const char*) start);
01194 }
01195
01196 j++;
01197 start=&data[j];
01198 }
01199 }
01200
01201 sparse_feature_matrix[lines].vec_index=lines;
01202 sparse_feature_matrix[lines].num_feat_entries=dims;
01203 sparse_feature_matrix[lines].features=feat;
01204
01205 old_sz=i+1;
01206 lines++;
01207 SG_PROGRESS(lines, 0, num_vectors, 1, "LOADING:\t");
01208 }
01209 }
01210 }
01211 SG_INFO("file successfully read\n");
01212 fclose(f);
01213 }
01214
01215 delete[] dummy;
01216
01217 if (do_sort_features)
01218 sort_features();
01219
01220 return lab;
01221 }
01222
/** sort each vector's entries in ascending feat_index order (in place
 * on the matrix; the old entry arrays are replaced and freed)
 *
 * Required by the merge-walk dot-product routines. Must be called
 * before any preprocessors are attached (asserted). Duplicate
 * feat_index values will trip the strict < assertion below.
 */
void sort_features()
{
	ASSERT(get_num_preproc()==0);

	if (!sparse_feature_matrix)
		SG_ERROR("Requires sparse feature matrix to be available in-memory\n");

	for (int32_t i=0; i<num_vectors; i++)
	{
		int32_t len=sparse_feature_matrix[i].num_feat_entries;

		if (!len)
			continue;

		TSparseEntry<ST>* sf_orig=sparse_feature_matrix[i].features;
		int32_t* feat_idx=new int32_t[len];
		int32_t* orig_idx=new int32_t[len];

		// sort an index permutation keyed by feat_index
		for (int j=0; j<len; j++)
		{
			feat_idx[j]=sf_orig[j].feat_index;
			orig_idx[j]=j;
		}

		CMath::qsort_index(feat_idx, orig_idx, len);

		// rebuild the entry array in sorted order
		TSparseEntry<ST>* sf_new= new TSparseEntry<ST>[len];
		for (int j=0; j<len; j++)
			sf_new[j]=sf_orig[orig_idx[j]];

		sparse_feature_matrix[i].features=sf_new;


		// verify strict ordering (also catches duplicate indices)
		for (int j=0; j<len-1; j++)
			ASSERT(sf_new[j].feat_index<sf_new[j+1].feat_index);

		delete[] orig_idx;
		delete[] feat_idx;
		delete[] sf_orig;
	}
}
01266
/** write features and labels to a file in SVMlight format
 * ("label dim:value ...\n"; dimensions written 1-based)
 *
 * @param fname output file path
 * @param label labels, one per vector (count must equal num_vectors)
 * @return true if the file could be opened and was written
 */
bool write_svmlight_file(char* fname, CLabels* label)
{
	ASSERT(label);
	int32_t num=label->get_num_labels();
	ASSERT(num>0);
	ASSERT(num==num_vectors);

	FILE* f=fopen(fname, "wb");

	if (f)
	{
		for (int32_t i=0; i<num; i++)
		{
			fprintf(f, "%d ", (int32_t) label->get_int_label(i));

			TSparseEntry<ST>* vec = sparse_feature_matrix[i].features;
			int32_t num_feat = sparse_feature_matrix[i].num_feat_entries;

			for (int32_t j=0; j<num_feat; j++)
			{
				// last pair ends the line; earlier pairs are space-separated
				if (j<num_feat-1)
					fprintf(f, "%d:%f ", (int32_t) vec[j].feat_index+1, (double) vec[j].entry);
				else
					fprintf(f, "%d:%f\n", (int32_t) vec[j].feat_index+1, (double) vec[j].entry);
			}
		}

		fclose(f);
		return true;
	}
	return false;
}
01305
/** dimensionality of the (dense) feature space
 *
 * @return num_features
 */
virtual int32_t get_dim_feature_space()
{
	return num_features;
}
01317
01324 virtual float64_t dot(int32_t vec_idx1, int32_t vec_idx2)
01325 {
01326 bool afree, bfree;
01327 int32_t alen, blen;
01328 TSparseEntry<ST>* avec=get_sparse_feature_vector(vec_idx1, alen, afree);
01329 TSparseEntry<ST>* bvec=get_sparse_feature_vector(vec_idx2, blen, bfree);
01330
01331 float64_t result=sparse_dot(1, avec, alen, bvec, blen);
01332
01333 free_sparse_feature_vector(avec, vec_idx1, afree);
01334 free_sparse_feature_vector(bvec, vec_idx2, bfree);
01335
01336 return result;
01337 }
01338
01345 virtual float64_t dense_dot(int32_t vec_idx1, const float64_t* vec2, int32_t vec2_len)
01346 {
01347 ASSERT(vec2);
01348 if (vec2_len!=num_features)
01349 {
01350 SG_ERROR("dimension of vec2 (=%d) does not match number of features (=%d)\n",
01351 vec2_len, num_features);
01352 }
01353 float64_t result=0;
01354
01355 bool vfree;
01356 int32_t vlen;
01357 TSparseEntry<ST>* sv=get_sparse_feature_vector(vec_idx1, vlen, vfree);
01358
01359 if (sv)
01360 {
01361 for (int32_t i=0; i<vlen; i++)
01362 result+=vec2[sv[i].feat_index]*sv[i].entry;
01363 }
01364
01365 free_sparse_feature_vector(sv, vec_idx1, vfree);
01366
01367 return result;
01368 }
01369
/** iterator state used by get_feature_iterator/get_next_feature */
struct sparse_feature_iterator
{
	/** entries of the vector being iterated */
	TSparseEntry<ST>* sv;
	/** index of that vector (needed to unlock the cache on free) */
	int32_t vidx;
	/** number of entries in sv */
	int32_t num_feat_entries;
	/** whether sv must be freed when the iterator is released */
	bool vfree;

	/** position of the next entry to return */
	int32_t index;

	/** dump the iterator state for debugging */
	void print_info()
	{
		SG_SPRINT("sv=%p, vidx=%d, num_feat_entries=%d, index=%d\n",
				sv, vidx, num_feat_entries, index);
	}
};
01392
/** create an iterator over the non-zero entries of one vector
 *
 * @param vector_index vector to iterate; SG_ERROR if out of bounds or
 *        no in-memory matrix is available
 * @return opaque iterator; release with free_feature_iterator()
 */
virtual void* get_feature_iterator(int32_t vector_index)
{
	if (vector_index>=num_vectors)
	{
		SG_ERROR("Index out of bounds (number of vectors %d, you "
				"requested %d)\n", num_vectors, vector_index);
	}

	if (!sparse_feature_matrix)
		SG_ERROR("Requires a in-memory feature matrix\n");

	// new[1] so that free_feature_iterator's delete[] matches
	sparse_feature_iterator* it=new sparse_feature_iterator[1];
	it->sv=get_sparse_feature_vector(vector_index, it->num_feat_entries, it->vfree);
	it->vidx=vector_index;
	it->index=0;

	return it;
}
01420
01431 virtual bool get_next_feature(int32_t& index, float64_t& value, void* iterator)
01432 {
01433 sparse_feature_iterator* it=(sparse_feature_iterator*) iterator;
01434 if (!it || it->index >= it->num_feat_entries)
01435 return false;
01436
01437 int32_t i=it->index++;
01438
01439 index = it->sv[i].feat_index;
01440 value = (float64_t) it->sv[i].entry;
01441
01442 return true;
01443 }
01444
/** release an iterator created by get_feature_iterator (also releases
 * the underlying feature vector / cache lock); NULL is a no-op
 *
 * @param iterator iterator to release
 */
virtual void free_feature_iterator(void* iterator)
{
	if (!iterator)
		return;

	sparse_feature_iterator* it=(sparse_feature_iterator*) iterator;
	free_sparse_feature_vector(it->sv, it->vidx, it->vfree);
	delete[] it;
}
01459
/** @return object name "SparseFeatures" */
inline virtual const char* get_name() const { return "SparseFeatures"; }

protected:
/** compute vector `num` on the fly; base implementation returns an
 * empty vector (len=0, NULL) — subclasses without an in-memory matrix
 * override this
 *
 * @param num vector index
 * @param len receives the number of entries
 * @param target optional preallocated (cache) storage to fill
 * @return the computed entries, or NULL
 */
virtual TSparseEntry<ST>* compute_sparse_feature_vector(int32_t num, int32_t& len, TSparseEntry<ST>* target=NULL)
{
	len=0;
	return NULL;
}
01479
01480 protected:
01481
01483 int32_t num_vectors;
01484
01486 int32_t num_features;
01487
01489 TSparse<ST>* sparse_feature_matrix;
01490
01492 CCache< TSparseEntry<ST> >* feature_cache;
01493 };
01494
#ifndef DOXYGEN_SHOULD_SKIP_THIS
/** specialize get_feature_type() for each supported element type,
 * mapping the C type to the corresponding EFeatureType constant */
#define GET_FEATURE_TYPE(sg_type, f_type) \
template<> inline EFeatureType CSparseFeatures<sg_type>::get_feature_type() \
{ \
	return f_type; \
}
GET_FEATURE_TYPE(bool, F_BOOL)
GET_FEATURE_TYPE(char, F_CHAR)
GET_FEATURE_TYPE(uint8_t, F_BYTE)
GET_FEATURE_TYPE(int16_t, F_SHORT)
GET_FEATURE_TYPE(uint16_t, F_WORD)
GET_FEATURE_TYPE(int32_t, F_INT)
GET_FEATURE_TYPE(uint32_t, F_UINT)
GET_FEATURE_TYPE(int64_t, F_LONG)
GET_FEATURE_TYPE(uint64_t, F_ULONG)
GET_FEATURE_TYPE(float32_t, F_SHORTREAL)
GET_FEATURE_TYPE(float64_t, F_DREAL)
GET_FEATURE_TYPE(floatmax_t, F_LONGREAL)
#undef GET_FEATURE_TYPE
01514
/** specialize load() for each supported element type, delegating to the
 * matching typed CFile sparse-matrix getter and adopting the result */
#define LOAD(fname, sg_type) \
template<> inline void CSparseFeatures<sg_type>::load(CFile* loader) \
{ \
	ASSERT(loader); \
	TSparse<sg_type>* matrix=NULL; \
	int32_t num_feat=0; \
	int32_t num_vec=0; \
	loader->fname(matrix, num_feat, num_vec); \
	set_sparse_feature_matrix(matrix, num_feat, num_vec); \
}
LOAD(get_bool_sparsematrix, bool)
LOAD(get_char_sparsematrix, char)
LOAD(get_byte_sparsematrix, uint8_t)
LOAD(get_short_sparsematrix, int16_t)
LOAD(get_word_sparsematrix, uint16_t)
LOAD(get_int_sparsematrix, int32_t)
LOAD(get_uint_sparsematrix, uint32_t)
LOAD(get_long_sparsematrix, int64_t)
LOAD(get_ulong_sparsematrix, uint64_t)
LOAD(get_shortreal_sparsematrix, float32_t)
LOAD(get_real_sparsematrix, float64_t)
LOAD(get_longreal_sparsematrix, floatmax_t)
#undef LOAD
01538
/** specialize save() for each supported element type, delegating to the
 * matching typed CFile sparse-matrix setter */
#define WRITE(fname, sg_type) \
template<> inline void CSparseFeatures<sg_type>::save(CFile* writer) \
{ \
	ASSERT(writer); \
	writer->fname(sparse_feature_matrix, num_features, num_vectors); \
}
WRITE(set_bool_sparsematrix, bool)
WRITE(set_char_sparsematrix, char)
WRITE(set_byte_sparsematrix, uint8_t)
WRITE(set_short_sparsematrix, int16_t)
WRITE(set_word_sparsematrix, uint16_t)
WRITE(set_int_sparsematrix, int32_t)
WRITE(set_uint_sparsematrix, uint32_t)
WRITE(set_long_sparsematrix, int64_t)
WRITE(set_ulong_sparsematrix, uint64_t)
WRITE(set_shortreal_sparsematrix, float32_t)
WRITE(set_real_sparsematrix, float64_t)
WRITE(set_longreal_sparsematrix, floatmax_t)
#undef WRITE
#endif // DOXYGEN_SHOULD_SKIP_THIS
01559 }
01560 #endif