00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #ifndef _SPARSEFEATURES__H__
00013 #define _SPARSEFEATURES__H__
00014
00015 #include <string.h>
00016 #include <stdlib.h>
00017
00018 #include "lib/common.h"
00019 #include "lib/Mathematics.h"
00020 #include "lib/Cache.h"
00021 #include "lib/io.h"
00022 #include "lib/Cache.h"
00023
00024 #include "features/Labels.h"
00025 #include "features/Features.h"
00026 #include "features/DotFeatures.h"
00027 #include "features/SimpleFeatures.h"
00028 #include "preproc/SparsePreProc.h"
00029
00030 template <class ST> class CSparsePreProc;
00031
00032 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00033
/** template struct TSparseEntry: a single non-zero element of a sparse
 * feature vector, i.e. one (feature index, value) pair */
template <class ST> struct TSparseEntry
{
	/** index of the feature this (non-zero) entry belongs to */
	int32_t feat_index;
	/** value of the feature at feat_index */
	ST entry;
};
00041
/** template struct TSparse: one sparse feature vector, stored as an array of
 * TSparseEntry elements (by convention sorted by feature index — the dot
 * product routines below rely on that ordering) */
template <class ST> struct TSparse
{
public:
	/** index of this vector within the feature matrix */
	int32_t vec_index;
	/** number of elements in the features array */
	int32_t num_feat_entries;
	/** array of num_feat_entries non-zero (index, value) pairs */
	TSparseEntry<ST>* features;
};
00053 #endif // DOXYGEN_SHOULD_SKIP_THIS
00054
00067 template <class ST> class CSparseFeatures : public CDotFeatures
00068 {
00069 public:
00074 CSparseFeatures(int32_t size=0)
00075 : CDotFeatures(size), num_vectors(0), num_features(0),
00076 sparse_feature_matrix(NULL), feature_cache(NULL)
00077 {}
00078
00086 CSparseFeatures(TSparse<ST>* src, int32_t num_feat, int32_t num_vec)
00087 : CDotFeatures(0), num_vectors(0), num_features(0),
00088 sparse_feature_matrix(NULL), feature_cache(NULL)
00089 {
00090 set_sparse_feature_matrix(src, num_feat, num_vec);
00091 }
00092
00100 CSparseFeatures(ST* src, int32_t num_feat, int32_t num_vec)
00101 : CDotFeatures(0), num_vectors(0), num_features(0),
00102 sparse_feature_matrix(NULL), feature_cache(NULL)
00103 {
00104 set_full_feature_matrix(src, num_feat, num_vec);
00105 }
00106
00108 CSparseFeatures(const CSparseFeatures & orig)
00109 : CDotFeatures(orig), num_vectors(orig.num_vectors),
00110 num_features(orig.num_features),
00111 sparse_feature_matrix(orig.sparse_feature_matrix),
00112 feature_cache(orig.feature_cache)
00113 {
00114 if (orig.sparse_feature_matrix)
00115 {
00116 free_sparse_feature_matrix();
00117 sparse_feature_matrix=new TSparse<ST>[num_vectors];
00118 memcpy(sparse_feature_matrix, orig.sparse_feature_matrix, sizeof(TSparse<ST>)*num_vectors);
00119 for (int32_t i=0; i< num_vectors; i++)
00120 {
00121 sparse_feature_matrix[i].features=new TSparseEntry<ST>[sparse_feature_matrix[i].num_feat_entries];
00122 memcpy(sparse_feature_matrix[i].features, orig.sparse_feature_matrix[i].features, sizeof(TSparseEntry<ST>)*sparse_feature_matrix[i].num_feat_entries);
00123
00124 }
00125 }
00126 }
00127
00132 CSparseFeatures(char* fname)
00133 : CDotFeatures(fname), num_vectors(0), num_features(0),
00134 sparse_feature_matrix(NULL), feature_cache(NULL)
00135 {}
00136
00137 virtual ~CSparseFeatures()
00138 {
00139 free_sparse_features();
00140 }
00141
00145 void free_sparse_feature_matrix()
00146 {
00147 clean_tsparse(sparse_feature_matrix, num_vectors);
00148 sparse_feature_matrix = NULL;
00149 num_vectors=0;
00150 num_features=0;
00151 }
00152
00156 void free_sparse_features()
00157 {
00158 free_sparse_feature_matrix();
00159 delete feature_cache;
00160 feature_cache = NULL;
00161 }
00162
00167 virtual CFeatures* duplicate() const
00168 {
00169 return new CSparseFeatures<ST>(*this);
00170 }
00171
00180 ST* get_full_feature_vector(int32_t num, int32_t& len)
00181 {
00182 bool vfree;
00183 int32_t num_feat;
00184 int32_t i;
00185 len=0;
00186 TSparseEntry<ST>* sv=get_sparse_feature_vector(num, num_feat, vfree);
00187 ST* fv=NULL;
00188
00189 if (sv)
00190 {
00191 len=num_features;
00192 fv=new ST[num_features];
00193
00194 for (i=0; i<num_features; i++)
00195 fv[i]=0;
00196
00197 for (i=0; i<num_feat; i++)
00198 fv[sv[i].feat_index]= sv[i].entry;
00199 }
00200
00201 free_sparse_feature_vector(sv, num, vfree);
00202
00203 return fv;
00204 }
00205
00212 void get_full_feature_vector(ST** dst, int32_t* len, int32_t num)
00213 {
00214 if (num>=num_vectors)
00215 {
00216 SG_ERROR("Index out of bounds (number of vectors %d, you "
00217 "requested %d)\n", num_vectors, num);
00218 }
00219
00220 bool vfree;
00221 int32_t num_feat=0;
00222 *len=0;
00223 TSparseEntry<ST>* sv=get_sparse_feature_vector(num, num_feat, vfree);
00224
00225 if (sv)
00226 {
00227 *len=num_features;
00228 *dst= (ST*) malloc(sizeof(ST)*num_features);
00229 memset(*dst, 0, sizeof(ST)*num_features);
00230
00231 for (int32_t i=0; i<num_feat; i++)
00232 (*dst)[sv[i].feat_index]= sv[i].entry;
00233 }
00234
00235 free_sparse_feature_vector(sv, num, vfree);
00236 }
00237
00238
00244 virtual inline int32_t get_nnz_features_for_vector(int32_t num)
00245 {
00246 bool vfree;
00247 int32_t len;
00248 TSparseEntry<ST>* sv = get_sparse_feature_vector(num, len, vfree);
00249 free_sparse_feature_vector(sv, num, vfree);
00250 return len;
00251 }
00252
00263 TSparseEntry<ST>* get_sparse_feature_vector(int32_t num, int32_t& len, bool& vfree)
00264 {
00265 ASSERT(num<num_vectors);
00266
00267 if (sparse_feature_matrix)
00268 {
00269 len= sparse_feature_matrix[num].num_feat_entries;
00270 vfree=false ;
00271 return sparse_feature_matrix[num].features;
00272 }
00273 else
00274 {
00275 TSparseEntry<ST>* feat=NULL;
00276 vfree=false;
00277
00278 if (feature_cache)
00279 {
00280 feat=feature_cache->lock_entry(num);
00281
00282 if (feat)
00283 return feat;
00284 else
00285 {
00286 feat=feature_cache->set_entry(num);
00287 }
00288 }
00289
00290 if (!feat)
00291 vfree=true;
00292
00293 feat=compute_sparse_feature_vector(num, len, feat);
00294
00295
00296 if (get_num_preproc())
00297 {
00298 int32_t tmp_len=len;
00299 TSparseEntry<ST>* tmp_feat_before = feat;
00300 TSparseEntry<ST>* tmp_feat_after = NULL;
00301
00302 for (int32_t i=0; i<get_num_preproc(); i++)
00303 {
00304
00305
00306 if (i!=0)
00307 delete[] tmp_feat_before;
00308 tmp_feat_before=tmp_feat_after;
00309 }
00310
00311 memcpy(feat, tmp_feat_after, sizeof(TSparseEntry<ST>)*tmp_len);
00312 delete[] tmp_feat_after;
00313 len=tmp_len ;
00314 SG_DEBUG( "len: %d len2: %d\n", len, num_features);
00315 }
00316 return feat ;
00317 }
00318 }
00319
00320
00331 ST sparse_dot(ST alpha, TSparseEntry<ST>* avec, int32_t alen, TSparseEntry<ST>* bvec, int32_t blen)
00332 {
00333 ST result=0;
00334
00335
00336 if (avec && bvec)
00337 {
00338 if (alen<=blen)
00339 {
00340 int32_t j=0;
00341 for (int32_t i=0; i<alen; i++)
00342 {
00343 int32_t a_feat_idx=avec[i].feat_index;
00344
00345 while ( (j<blen) && (bvec[j].feat_index < a_feat_idx) )
00346 j++;
00347
00348 if ( (j<blen) && (bvec[j].feat_index == a_feat_idx) )
00349 {
00350 result+= avec[i].entry * bvec[j].entry;
00351 j++;
00352 }
00353 }
00354 }
00355 else
00356 {
00357 int32_t j=0;
00358 for (int32_t i=0; i<blen; i++)
00359 {
00360 int32_t b_feat_idx=bvec[i].feat_index;
00361
00362 while ( (j<alen) && (avec[j].feat_index < b_feat_idx) )
00363 j++;
00364
00365 if ( (j<alen) && (avec[j].feat_index == b_feat_idx) )
00366 {
00367 result+= bvec[i].entry * avec[j].entry;
00368 j++;
00369 }
00370 }
00371 }
00372
00373 result*=alpha;
00374 }
00375
00376 return result;
00377 }
00378
00389 ST dense_dot(ST alpha, int32_t num, ST* vec, int32_t dim, ST b)
00390 {
00391 ASSERT(vec);
00392 ASSERT(dim==num_features);
00393 ST result=b;
00394
00395 bool vfree;
00396 int32_t num_feat;
00397 TSparseEntry<ST>* sv=get_sparse_feature_vector(num, num_feat, vfree);
00398
00399 if (sv)
00400 {
00401 for (int32_t i=0; i<num_feat; i++)
00402 result+=alpha*vec[sv[i].feat_index]*sv[i].entry;
00403 }
00404
00405 free_sparse_feature_vector(sv, num, vfree);
00406 return result;
00407 }
00408
00418 void add_to_dense_vec(float64_t alpha, int32_t num, float64_t* vec, int32_t dim, bool abs_val=false)
00419 {
00420 ASSERT(vec);
00421 ASSERT(dim==num_features);
00422
00423 bool vfree;
00424 int32_t num_feat;
00425 TSparseEntry<ST>* sv=get_sparse_feature_vector(num, num_feat, vfree);
00426
00427 if (sv)
00428 {
00429 if (abs_val)
00430 {
00431 for (int32_t i=0; i<num_feat; i++)
00432 vec[sv[i].feat_index]+= alpha*CMath::abs(sv[i].entry);
00433 }
00434 else
00435 {
00436 for (int32_t i=0; i<num_feat; i++)
00437 vec[sv[i].feat_index]+= alpha*sv[i].entry;
00438 }
00439 }
00440
00441 free_sparse_feature_vector(sv, num, vfree);
00442 }
00443
00450 void free_sparse_feature_vector(TSparseEntry<ST>* feat_vec, int32_t num, bool free)
00451 {
00452 if (feature_cache)
00453 feature_cache->unlock_entry(num);
00454
00455 if (free)
00456 delete[] feat_vec ;
00457 }
00458
00466 TSparse<ST>* get_sparse_feature_matrix(int32_t &num_feat, int32_t &num_vec)
00467 {
00468 num_feat=num_features;
00469 num_vec=num_vectors;
00470
00471 return sparse_feature_matrix;
00472 }
00473
00482 void get_sparse_feature_matrix(TSparse<ST>** dst, int32_t* num_feat,
00483 int32_t* num_vec, int64_t* nnz)
00484 {
00485 *nnz=get_num_nonzero_entries();
00486 *num_feat=num_features;
00487 *num_vec=num_vectors;
00488 *dst=sparse_feature_matrix;
00489 }
00490
00496 void clean_tsparse(TSparse<ST>* sfm, int32_t num_vec)
00497 {
00498 if (sfm)
00499 {
00500 for (int32_t i=0; i<num_vec; i++)
00501 delete[] sfm[i].features;
00502
00503 delete[] sfm;
00504 }
00505 }
00506
00516 TSparse<ST>* get_transposed(int32_t &num_feat, int32_t &num_vec)
00517 {
00518 num_feat=num_vectors;
00519 num_vec=num_features;
00520
00521 int32_t* hist=new int32_t[num_features];
00522 memset(hist, 0, sizeof(int32_t)*num_features);
00523
00524
00525 for (int32_t v=0; v<num_vectors; v++)
00526 {
00527 int32_t vlen;
00528 bool vfree;
00529 TSparseEntry<ST>* sv=get_sparse_feature_vector(v, vlen, vfree);
00530
00531 for (int32_t i=0; i<vlen; i++)
00532 hist[sv[i].feat_index]++;
00533
00534 free_sparse_feature_vector(sv, v, vfree);
00535 }
00536
00537
00538 TSparse<ST>* sfm=new TSparse<ST>[num_vec];
00539 for (int32_t v=0; v<num_vec; v++)
00540 {
00541 sfm[v].features= new TSparseEntry<ST>[hist[v]];
00542 sfm[v].num_feat_entries=hist[v];
00543 sfm[v].vec_index=v;
00544 }
00545
00546
00547 memset(hist,0,sizeof(int32_t)*num_features);
00548 for (int32_t v=0; v<num_vectors; v++)
00549 {
00550 int32_t vlen;
00551 bool vfree;
00552 TSparseEntry<ST>* sv=get_sparse_feature_vector(v, vlen, vfree);
00553
00554 for (int32_t i=0; i<vlen; i++)
00555 {
00556 int32_t vidx=sv[i].feat_index;
00557 int32_t fidx=v;
00558 sfm[vidx].features[hist[vidx]].feat_index=fidx;
00559 sfm[vidx].features[hist[vidx]].entry=sv[i].entry;
00560 hist[vidx]++;
00561 }
00562
00563 free_sparse_feature_vector(sv, v, vfree);
00564 }
00565
00566 delete[] hist;
00567 return sfm;
00568 }
00569
00579 virtual void set_sparse_feature_matrix(TSparse<ST>* src, int32_t num_feat, int32_t num_vec)
00580 {
00581 free_sparse_feature_matrix();
00582
00583 sparse_feature_matrix=src;
00584 num_features=num_feat;
00585 num_vectors=num_vec;
00586 }
00587
00595 ST* get_full_feature_matrix(int32_t &num_feat, int32_t &num_vec)
00596 {
00597 SG_INFO( "converting sparse features to full feature matrix of %ld x %ld entries\n", num_vectors, num_features);
00598 num_feat=num_features;
00599 num_vec=num_vectors;
00600
00601 ST* fm=new ST[num_feat*num_vec];
00602
00603 if (fm)
00604 {
00605 for (int64_t i=0; i<num_feat*num_vec; i++)
00606 fm[i]=0;
00607
00608 for (int32_t v=0; v<num_vec; v++)
00609 {
00610 for (int32_t f=0; f<sparse_feature_matrix[v].num_feat_entries; f++)
00611 {
00612 int64_t offs= (sparse_feature_matrix[v].vec_index * num_feat) + sparse_feature_matrix[v].features[f].feat_index;
00613 fm[offs]= sparse_feature_matrix[v].features[f].entry;
00614 }
00615 }
00616 }
00617 else
00618 SG_ERROR( "error allocating memory for dense feature matrix\n");
00619
00620 return fm;
00621 }
00622
00630 void get_full_feature_matrix(ST** dst, int32_t* num_feat, int32_t* num_vec)
00631 {
00632 SG_INFO( "converting sparse features to full feature matrix of %ld x %ld entries\n", num_vectors, num_features);
00633 *num_feat=num_features;
00634 *num_vec=num_vectors;
00635
00636 *dst= (ST*) malloc(sizeof(ST)*num_features*num_vectors);
00637
00638 if (*dst)
00639 {
00640 for (int64_t i=0; i<num_features*num_vectors; i++)
00641 (*dst)[i]=0;
00642
00643 for (int32_t v=0; v<num_vectors; v++)
00644 {
00645 for (int32_t f=0; f<sparse_feature_matrix[v].num_feat_entries; f++)
00646 {
00647 int64_t offs= (sparse_feature_matrix[v].vec_index * num_features) + sparse_feature_matrix[v].features[f].feat_index;
00648 (*dst)[offs]= sparse_feature_matrix[v].features[f].entry;
00649 }
00650 }
00651 }
00652 else
00653 SG_ERROR( "error allocating memory for dense feature matrix\n");
00654 }
00655
00665 virtual bool set_full_feature_matrix(ST* src, int32_t num_feat, int32_t num_vec)
00666 {
00667 free_sparse_feature_matrix();
00668 bool result=true;
00669 num_features=num_feat;
00670 num_vectors=num_vec;
00671
00672 SG_INFO("converting dense feature matrix to sparse one\n");
00673 int32_t* num_feat_entries=new int[num_vectors];
00674
00675 if (num_feat_entries)
00676 {
00677 int32_t num_total_entries=0;
00678
00679
00680 for (int32_t i=0; i< num_vec; i++)
00681 {
00682 num_feat_entries[i]=0;
00683 for (int32_t j=0; j< num_feat; j++)
00684 {
00685 if (src[i*((int64_t) num_feat) + j] != 0)
00686 num_feat_entries[i]++;
00687 }
00688 }
00689
00690 if (num_vec>0)
00691 {
00692 sparse_feature_matrix=new TSparse<ST>[num_vec];
00693
00694 if (sparse_feature_matrix)
00695 {
00696 for (int32_t i=0; i< num_vec; i++)
00697 {
00698 sparse_feature_matrix[i].vec_index=i;
00699 sparse_feature_matrix[i].num_feat_entries=0;
00700 sparse_feature_matrix[i].features= NULL;
00701
00702 if (num_feat_entries[i]>0)
00703 {
00704 sparse_feature_matrix[i].features= new TSparseEntry<ST>[num_feat_entries[i]];
00705
00706 if (!sparse_feature_matrix[i].features)
00707 {
00708 SG_INFO( "allocation of features failed\n");
00709 return false;
00710 }
00711
00712 sparse_feature_matrix[i].num_feat_entries=num_feat_entries[i];
00713 int32_t sparse_feat_idx=0;
00714
00715 for (int32_t j=0; j< num_feat; j++)
00716 {
00717 int64_t pos= i*num_feat + j;
00718
00719 if (src[pos] != 0)
00720 {
00721 sparse_feature_matrix[i].features[sparse_feat_idx].entry=src[pos];
00722 sparse_feature_matrix[i].features[sparse_feat_idx].feat_index=j;
00723 sparse_feat_idx++;
00724 num_total_entries++;
00725 }
00726 }
00727 }
00728 }
00729 }
00730 else
00731 {
00732 SG_ERROR( "allocation of sparse feature matrix failed\n");
00733 result=false;
00734 }
00735
00736 SG_INFO( "sparse feature matrix has %ld entries (full matrix had %ld, sparsity %2.2f%%)\n",
00737 num_total_entries, num_feat*num_vec, (100.0*num_total_entries)/(num_feat*num_vec));
00738 }
00739 else
00740 {
00741 SG_ERROR( "huh ? zero size matrix given ?\n");
00742 result=false;
00743 }
00744 }
00745 delete[] num_feat_entries;
00746 return result;
00747 }
00748
00754 virtual bool apply_preproc(bool force_preprocessing=false)
00755 {
00756 SG_INFO( "force: %d\n", force_preprocessing);
00757
00758 if ( sparse_feature_matrix && get_num_preproc() )
00759 {
00760 for (int32_t i=0; i<get_num_preproc(); i++)
00761 {
00762 if ( (!is_preprocessed(i) || force_preprocessing) )
00763 {
00764 set_preprocessed(i);
00765 SG_INFO( "preprocessing using preproc %s\n", get_preproc(i)->get_name());
00766 if (((CSparsePreProc<ST>*) get_preproc(i))->apply_to_sparse_feature_matrix(this) == NULL)
00767 return false;
00768 }
00769 return true;
00770 }
00771 return true;
00772 }
00773 else
00774 {
00775 SG_WARNING( "no sparse feature matrix available or features already preprocessed - skipping.\n");
00776 return false;
00777 }
00778 }
00779
00784 virtual int32_t get_size() { return sizeof(ST); }
00785
00791 bool obtain_from_simple(CSimpleFeatures<ST>* sf)
00792 {
00793 int32_t num_feat=0;
00794 int32_t num_vec=0;
00795 ST* fm=sf->get_feature_matrix(num_feat, num_vec);
00796 ASSERT(fm && num_feat>0 && num_vec>0);
00797
00798 return set_full_feature_matrix(fm, num_feat, num_vec);
00799 }
00800
00805 virtual inline int32_t get_num_vectors() { return num_vectors; }
00806
00811 inline int32_t get_num_features() { return num_features; }
00812
00824 inline int32_t set_num_features(int32_t num)
00825 {
00826 int32_t n=num_features;
00827 ASSERT(n<=num);
00828 num_features=num;
00829 return num_features;
00830 }
00831
00836 inline virtual EFeatureClass get_feature_class() { return C_SPARSE; }
00837
00842 inline virtual EFeatureType get_feature_type();
00843
00850 void free_feature_vector(TSparseEntry<ST>* feat_vec, int32_t num, bool free)
00851 {
00852 if (feature_cache)
00853 feature_cache->unlock_entry(num);
00854
00855 if (free)
00856 delete[] feat_vec ;
00857 }
00858
00863 int64_t get_num_nonzero_entries()
00864 {
00865 int64_t num=0;
00866 for (int32_t i=0; i<num_vectors; i++)
00867 num+=sparse_feature_matrix[i].num_feat_entries;
00868
00869 return num;
00870 }
00871
00877 float64_t* compute_squared(float64_t* sq)
00878 {
00879 ASSERT(sq);
00880
00881 int32_t len=0;
00882 bool do_free=false;
00883
00884 for (int32_t i=0; i<this->get_num_vectors(); i++)
00885 {
00886 sq[i]=0;
00887 TSparseEntry<float64_t>* vec = ((CSparseFeatures<float64_t>*) this)->get_sparse_feature_vector(i, len, do_free);
00888
00889 for (int32_t j=0; j<len; j++)
00890 sq[i] += vec[j].entry * vec[j].entry;
00891
00892 ((CSparseFeatures<float64_t>*) this)->free_feature_vector(vec, i, do_free);
00893 }
00894
00895 return sq;
00896 }
00897
00910 float64_t compute_squared_norm(CSparseFeatures<float64_t>* lhs, float64_t* sq_lhs, int32_t idx_a, CSparseFeatures<float64_t>* rhs, float64_t* sq_rhs, int32_t idx_b)
00911 {
00912 int32_t i,j;
00913 int32_t alen, blen;
00914 bool afree, bfree;
00915 ASSERT(lhs);
00916 ASSERT(rhs);
00917
00918 TSparseEntry<float64_t>* avec=lhs->get_sparse_feature_vector(idx_a, alen, afree);
00919 TSparseEntry<float64_t>* bvec=rhs->get_sparse_feature_vector(idx_b, blen, bfree);
00920 ASSERT(avec);
00921 ASSERT(bvec);
00922
00923 float64_t result=sq_lhs[idx_a]+sq_rhs[idx_b];
00924
00925 if (alen<=blen)
00926 {
00927 j=0;
00928 for (i=0; i<alen; i++)
00929 {
00930 int32_t a_feat_idx=avec[i].feat_index;
00931
00932 while ((j<blen) && (bvec[j].feat_index < a_feat_idx))
00933 j++;
00934
00935 if ((j<blen) && (bvec[j].feat_index == a_feat_idx))
00936 {
00937 result-=2*(avec[i].entry*bvec[j].entry);
00938 j++;
00939 }
00940 }
00941 }
00942 else
00943 {
00944 j=0;
00945 for (i=0; i<blen; i++)
00946 {
00947 int32_t b_feat_idx=bvec[i].feat_index;
00948
00949 while ((j<alen) && (avec[j].feat_index<b_feat_idx))
00950 j++;
00951
00952 if ((j<alen) && (avec[j].feat_index == b_feat_idx))
00953 {
00954 result-=2*(bvec[i].entry*avec[j].entry);
00955 j++;
00956 }
00957 }
00958 }
00959
00960 ((CSparseFeatures<float64_t>*) lhs)->free_feature_vector(avec, idx_a, afree);
00961 ((CSparseFeatures<float64_t>*) rhs)->free_feature_vector(bvec, idx_b, bfree);
00962
00963 return CMath::abs(result);
00964 }
00965
00971 CLabels* load_svmlight_file(char* fname)
00972 {
00973 CLabels* lab=NULL;
00974
00975 size_t blocksize=1024*1024;
00976 size_t required_blocksize=blocksize;
00977 uint8_t* dummy=new uint8_t[blocksize];
00978 FILE* f=fopen(fname, "ro");
00979
00980 if (f)
00981 {
00982 free_sparse_feature_matrix();
00983 num_vectors=0;
00984 num_features=0;
00985
00986 SG_INFO("counting line numbers in file %s\n", fname);
00987 size_t sz=blocksize;
00988 size_t block_offs=0;
00989 size_t old_block_offs=0;
00990 fseek(f, 0, SEEK_END);
00991 size_t fsize=ftell(f);
00992 rewind(f);
00993
00994 while (sz == blocksize)
00995 {
00996 sz=fread(dummy, sizeof(uint8_t), blocksize, f);
00997 bool contains_cr=false;
00998 for (size_t i=0; i<sz; i++)
00999 {
01000 block_offs++;
01001 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
01002 {
01003 num_vectors++;
01004 contains_cr=true;
01005 required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs+1);
01006 old_block_offs=block_offs;
01007 }
01008 }
01009 SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");
01010 }
01011
01012 SG_INFO("found %d feature vectors\n", num_vectors);
01013 delete[] dummy;
01014 blocksize=required_blocksize;
01015 dummy = new uint8_t[blocksize+1];
01016
01017 lab=new CLabels(num_vectors);
01018 sparse_feature_matrix=new TSparse<ST>[num_vectors];
01019
01020 rewind(f);
01021 sz=blocksize;
01022 int32_t lines=0;
01023 while (sz == blocksize)
01024 {
01025 sz=fread(dummy, sizeof(uint8_t), blocksize, f);
01026
01027 size_t old_sz=0;
01028 for (size_t i=0; i<sz; i++)
01029 {
01030 if (i==sz-1 && dummy[i]!='\n' && sz==blocksize)
01031 {
01032 size_t len=i-old_sz+1;
01033 uint8_t* data=&dummy[old_sz];
01034
01035 for (int32_t j=0; j<len; j++)
01036 dummy[j]=data[j];
01037
01038 sz=fread(dummy+len, sizeof(uint8_t), blocksize-len, f);
01039 i=0;
01040 old_sz=0;
01041 sz+=len;
01042 }
01043
01044 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
01045 {
01046
01047 size_t len=i-old_sz;
01048 uint8_t* data=&dummy[old_sz];
01049
01050 int32_t dims=0;
01051 for (int32_t j=0; j<len; j++)
01052 {
01053 if (data[j]==':')
01054 dims++;
01055 }
01056
01057 if (dims<=0)
01058 {
01059 SG_ERROR("Error in line %d - number of"
01060 " dimensions is %d line is %d characters"
01061 " long\n line_content:'%.*s'\n", lines,
01062 dims, len, len, (const char*) data);
01063 }
01064
01065 TSparseEntry<ST>* feat=new TSparseEntry<ST>[dims];
01066 int32_t j=0;
01067 for (; j<len; j++)
01068 {
01069 if (data[j]==' ')
01070 {
01071 data[j]='\0';
01072
01073 lab->set_label(lines, atof((const char*) data));
01074 break;
01075 }
01076 }
01077
01078 int32_t d=0;
01079 j++;
01080 uint8_t* start=&data[j];
01081 for (; j<len; j++)
01082 {
01083 if (data[j]==':')
01084 {
01085 data[j]='\0';
01086
01087 feat[d].feat_index=(int32_t) atoi((const char*) start)-1;
01088 num_features=CMath::max(num_features, feat[d].feat_index+1);
01089
01090 j++;
01091 start=&data[j];
01092 for (; j<len; j++)
01093 {
01094 if (data[j]==' ' || data[j]=='\n')
01095 {
01096 data[j]='\0';
01097 feat[d].entry=(ST) atof((const char*) start);
01098 d++;
01099 break;
01100 }
01101 }
01102
01103 if (j==len)
01104 {
01105 data[j]='\0';
01106 feat[dims-1].entry=(ST) atof((const char*) start);
01107 }
01108
01109 j++;
01110 start=&data[j];
01111 }
01112 }
01113
01114 sparse_feature_matrix[lines].vec_index=lines;
01115 sparse_feature_matrix[lines].num_feat_entries=dims;
01116 sparse_feature_matrix[lines].features=feat;
01117
01118 old_sz=i+1;
01119 lines++;
01120 SG_PROGRESS(lines, 0, num_vectors, 1, "LOADING:\t");
01121 }
01122 }
01123 }
01124 SG_INFO("file successfully read\n");
01125 fclose(f);
01126 }
01127
01128 delete[] dummy;
01129
01130 return lab;
01131 }
01132
01139 bool write_svmlight_file(char* fname, CLabels* label)
01140 {
01141 ASSERT(label);
01142 int32_t num=label->get_num_labels();
01143 ASSERT(num>0);
01144 ASSERT(num==num_vectors);
01145
01146 FILE* f=fopen(fname, "wb");
01147
01148 if (f)
01149 {
01150 for (int32_t i=0; i<num; i++)
01151 {
01152 fprintf(f, "%d ", (int32_t) label->get_int_label(i));
01153
01154 TSparseEntry<ST>* vec = sparse_feature_matrix[i].features;
01155 int32_t num_feat = sparse_feature_matrix[i].num_feat_entries;
01156
01157 for (int32_t j=0; j<num_feat; j++)
01158 {
01159 if (j<num_feat-1)
01160 fprintf(f, "%d:%f ", (int32_t) vec[j].feat_index+1, (double) vec[j].entry);
01161 else
01162 fprintf(f, "%d:%f\n", (int32_t) vec[j].feat_index+1, (double) vec[j].entry);
01163 }
01164 }
01165
01166 fclose(f);
01167 return true;
01168 }
01169 return false;
01170 }
01171
01179 virtual int32_t get_dim_feature_space()
01180 {
01181 return num_features;
01182 }
01183
01190 virtual float64_t dot(int32_t vec_idx1, int32_t vec_idx2)
01191 {
01192 bool afree, bfree;
01193 int32_t alen, blen;
01194 TSparseEntry<ST>* avec=get_sparse_feature_vector(vec_idx1, alen, afree);
01195 TSparseEntry<ST>* bvec=get_sparse_feature_vector(vec_idx2, blen, bfree);
01196
01197 float64_t result=sparse_dot(1, avec, alen, bvec, blen);
01198
01199 free_sparse_feature_vector(avec, vec_idx1, afree);
01200 free_sparse_feature_vector(bvec, vec_idx2, bfree);
01201
01202 return result;
01203 }
01204
01211 virtual float64_t dense_dot(int32_t vec_idx1, const float64_t* vec2, int32_t vec2_len)
01212 {
01213 ASSERT(vec2);
01214 ASSERT(vec2_len==num_features);
01215 float64_t result=0;
01216
01217 bool vfree;
01218 int32_t vlen;
01219 TSparseEntry<ST>* sv=get_sparse_feature_vector(vec_idx1, vlen, vfree);
01220
01221 if (sv)
01222 {
01223 for (int32_t i=0; i<vlen; i++)
01224 result+=vec2[sv[i].feat_index]*sv[i].entry;
01225 }
01226
01227 free_sparse_feature_vector(sv, vec_idx1, vfree);
01228
01229 return result;
01230 }
01231
01233 inline virtual const char* get_name() const { return "SparseFeatures"; }
01234
01235 protected:
01246 virtual TSparseEntry<ST>* compute_sparse_feature_vector(int32_t num, int32_t& len, TSparseEntry<ST>* target=NULL)
01247 {
01248 len=0;
01249 return NULL;
01250 }
01251
01252 protected:
01253
01255 int32_t num_vectors;
01256
01258 int32_t num_features;
01259
01261 TSparse<ST>* sparse_feature_matrix;
01262
01264 CCache< TSparseEntry<ST> >* feature_cache;
01265 };
01266
/** @return feature type BOOL */
template<> inline EFeatureType CSparseFeatures<bool>::get_feature_type()
{
	return F_BOOL;
}
01275
/** @return feature type CHAR */
template<> inline EFeatureType CSparseFeatures<char>::get_feature_type()
{
	return F_CHAR;
}
01284
/** @return feature type BYTE */
template<> inline EFeatureType CSparseFeatures<uint8_t>::get_feature_type()
{
	return F_BYTE;
}
01293
/** @return feature type SHORT */
template<> inline EFeatureType CSparseFeatures<int16_t>::get_feature_type()
{
	return F_SHORT;
}
01302
/** @return feature type WORD */
template<> inline EFeatureType CSparseFeatures<uint16_t>::get_feature_type()
{
	return F_WORD;
}
01311
/** @return feature type INT */
template<> inline EFeatureType CSparseFeatures<int32_t>::get_feature_type()
{
	return F_INT;
}
01320
/** @return feature type UINT */
template<> inline EFeatureType CSparseFeatures<uint32_t>::get_feature_type()
{
	return F_UINT;
}
01329
/** @return feature type LONG */
template<> inline EFeatureType CSparseFeatures<int64_t>::get_feature_type()
{
	return F_LONG;
}
01338
/** @return feature type ULONG */
template<> inline EFeatureType CSparseFeatures<uint64_t>::get_feature_type()
{
	return F_ULONG;
}
01347
/** @return feature type SHORTREAL (single precision) */
template<> inline EFeatureType CSparseFeatures<float32_t>::get_feature_type()
{
	return F_SHORTREAL;
}
01356
/** @return feature type DREAL (double precision) */
template<> inline EFeatureType CSparseFeatures<float64_t>::get_feature_type()
{
	return F_DREAL;
}
01365
/** @return feature type LONGREAL (extended precision) */
template<> inline EFeatureType CSparseFeatures<floatmax_t>::get_feature_type()
{
	return F_LONGREAL;
}
01374 #endif