File.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2009 Soeren Sonnenburg
00008  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
00009  */
00010 
00011 #include <string.h>
00012 #include <sys/types.h>
00013 #include <sys/stat.h>
00014 #include <unistd.h>
00015 #include <ctype.h>
00016 
00017 #include "lib/File.h"
00018 #include "lib/SimpleFile.h"
00019 
00020 #include "features/StringFeatures.h"
00021 #include "features/SparseFeatures.h"
00022 
00023 
00024 CFile::CFile(FILE* f)
00025 : CSGObject()
00026 {
00027     file=f;
00028     filename=NULL;
00029     expected_type=F_UNKNOWN;
00030 }
00031 
00032 CFile::CFile(char* fname, char rw, EFeatureType typ, char file_fourcc[4])
00033 : CSGObject()
00034 {
00035     status=false;
00036     task=rw;
00037     expected_type=typ;
00038     filename=strdup(fname);
00039     char mode[2];
00040     mode[0]=rw;
00041     mode[1]='\0';
00042 
00043 
00044     if (rw=='r' || rw == 'w')
00045     {
00046         if (filename)
00047         {
00048             if ((file=fopen((const char*) filename, (const char*) mode)))
00049                 status=true;
00050         }
00051     }
00052     else
00053         SG_ERROR("unknown mode '%c'\n", mode[0]);
00054 
00055     if (file_fourcc)
00056     {
00057         if (rw=='r')
00058             status=read_header();
00059         else if (rw=='w')
00060             status=write_header();
00061 
00062         if (!status)
00063             fclose(file);
00064 
00065         file=NULL;
00066     }
00067 }
00068 
00069 CFile::~CFile()
00070 {
00071     free(filename);
00072     if (file)
00073       fclose(file);
00074     filename=NULL;
00075     file=NULL;
00076 }
00077 
00078 int32_t* CFile::load_int_data(int32_t* target, int64_t& num)
00079 {
00080     ASSERT(expected_type==F_INT);
00081     CSimpleFile<int32_t> f(filename, file);
00082     target=f.load(target, num);
00083     status=(target!=NULL);
00084     return target;
00085 }
00086 
00087 bool CFile::save_int_data(int32_t* src, int64_t num)
00088 {
00089     ASSERT(expected_type==F_INT);
00090     CSimpleFile<int32_t> f(filename, file);
00091     status=f.save(src, num);
00092     return status;
00093 }
00094 
00095 float64_t* CFile::load_real_data(float64_t* target, int64_t& num)
00096 {
00097     ASSERT(expected_type==F_DREAL);
00098     CSimpleFile<float64_t> f(filename, file);
00099     target=f.load(target, num);
00100     status=(target!=NULL);
00101     return target;
00102 }
00103 
00104 float32_t* CFile::load_shortreal_data(float32_t* target, int64_t& num)
00105 {
00106     ASSERT(expected_type==F_SHORTREAL);
00107     CSimpleFile<float32_t> f(filename, file);
00108     target=f.load(target, num);
00109     status=(target!=NULL);
00110     return target;
00111 }
00112 
00113 bool CFile::save_real_data(float64_t* src, int64_t num)
00114 {
00115     ASSERT(expected_type==F_DREAL);
00116     CSimpleFile<float64_t> f(filename, file);
00117     status=f.save(src, num);
00118     return status;
00119 }
00120 
00121 bool CFile::save_shortreal_data(float32_t* src, int64_t num)
00122 {
00123     ASSERT(expected_type==F_SHORTREAL);
00124     CSimpleFile<float32_t> f(filename, file);
00125     status=f.save(src, num);
00126     return status;
00127 }
00128 
00129 char* CFile::load_char_data(char* target, int64_t& num)
00130 {
00131     ASSERT(expected_type==F_CHAR);
00132     CSimpleFile<char> f(filename, file);
00133     target=f.load(target, num);
00134     status=(target!=NULL);
00135     return target;
00136 }
00137 
00138 bool CFile::save_char_data(char* src, int64_t num)
00139 {
00140     ASSERT(expected_type==F_CHAR);
00141     CSimpleFile<char> f(filename, file);
00142     status=f.save(src, num);
00143     return status;
00144 }
00145 
00146 uint8_t* CFile::load_byte_data(uint8_t* target, int64_t& num)
00147 {
00148     ASSERT(expected_type==F_BYTE);
00149     CSimpleFile<uint8_t> f(filename, file);
00150     target=f.load(target, num);
00151     status=(target!=NULL);
00152     return target;
00153 }
00154 
00155 bool CFile::save_byte_data(uint8_t* src, int64_t num)
00156 {
00157     ASSERT(expected_type==F_BYTE);
00158     CSimpleFile<uint8_t> f(filename, file);
00159     status=f.save(src, num);
00160     return status;
00161 }
00162 
00163 uint16_t* CFile::load_word_data(uint16_t* target, int64_t& num)
00164 {
00165     ASSERT(expected_type==F_WORD);
00166     CSimpleFile<uint16_t> f(filename, file);
00167     target=f.load(target, num);
00168     status=(target!=NULL);
00169     return target;
00170 }
00171 
00172 bool CFile::save_word_data(uint16_t* src, int64_t num)
00173 {
00174     ASSERT(expected_type==F_WORD);
00175     CSimpleFile<uint16_t> f(filename, file);
00176     status=f.save(src, num);
00177     return status;
00178 }
00179 
00180 int16_t* CFile::load_short_data(int16_t* target, int64_t& num)
00181 {
00182     ASSERT(expected_type==F_SHORT);
00183     CSimpleFile<int16_t> f(filename, file);
00184     target=f.load(target, num);
00185     status=(target!=NULL);
00186     return target;
00187 }
00188 
00189 bool CFile::save_short_data(int16_t* src, int64_t num)
00190 {
00191     ASSERT(expected_type==F_SHORT);
00192     CSimpleFile<int16_t> f(filename, file);
00193     status=f.save(src, num);
00194     return status;
00195 }
00196 
00197 int32_t CFile::parse_first_header(EFeatureType &type)
00198 {
00199     return -1;
00200 }
00201 
00202 int32_t CFile::parse_next_header(EFeatureType &type)
00203 {
00204     return -1;
00205 }
00206 
00207 
00208 bool CFile::read_header()
00209 {
00210     ASSERT(file);
00211     uint32_t intlen=0;
00212     uint32_t endian=0;
00213     uint32_t file_fourcc=0;
00214     uint32_t doublelen=0;
00215 
00216     if ( (fread(&intlen, sizeof(uint8_t), 1, file)==1) &&
00217             (fread(&doublelen, sizeof(uint8_t), 1, file)==1) &&
00218             (fread(&endian, (uint32_t) intlen, 1, file)== 1) &&
00219             (fread(&file_fourcc, (uint32_t) intlen, 1, file)==1))
00220         return true;
00221     else
00222         return false;
00223 }
00224 
00225 bool CFile::write_header()
00226 {
00227     uint8_t intlen=sizeof(uint32_t);
00228     uint8_t doublelen=sizeof(double);
00229     uint32_t endian=0x12345678;
00230 
00231     if ((fwrite(&intlen, sizeof(uint8_t), 1, file)==1) &&
00232             (fwrite(&doublelen, sizeof(uint8_t), 1, file)==1) &&
00233             (fwrite(&endian, sizeof(uint32_t), 1, file)==1) &&
00234             (fwrite(&fourcc, 4*sizeof(char), 1, file)==1))
00235         return true;
00236     else
00237         return false;
00238 }
00239 
00240 template <class T> void CFile::append_item(
00241     CDynamicArray<T>* items, char* ptr_data, char* ptr_item)
00242 {
00243     size_t len=(ptr_data-ptr_item)/sizeof(char);
00244     char* item=new char[len+1];
00245     memset(item, 0, sizeof(char)*(len+1));
00246     item=strncpy(item, ptr_item, len);
00247 
00248     SG_DEBUG("current %c, len %d, item %s\n", *ptr_data, len, item);
00249     items->append_element(item);
00250 }
00251 
00252 bool CFile::read_real_valued_dense(
00253     float64_t*& matrix, int32_t& num_feat, int32_t& num_vec)
00254 {
00255     ASSERT(expected_type==F_DREAL);
00256 
00257     struct stat stats;
00258     if (stat(filename, &stats)!=0)
00259         SG_ERROR("Could not get file statistics.\n");
00260 
00261     char* data=new char[stats.st_size+1];
00262     memset(data, 0, sizeof(char)*(stats.st_size+1));
00263     size_t nread=fread(data, sizeof(char), stats.st_size, file);
00264     if (nread<=0)
00265         SG_ERROR("Could not read data from %s.\n");
00266 
00267     SG_DEBUG("data read from file:\n%s\n", data);
00268 
00269     // determine num_feat and num_vec, populate dynamic array
00270     int32_t nf=0;
00271     num_feat=0;
00272     num_vec=0;
00273     char* ptr_item=NULL;
00274     char* ptr_data=data;
00275     CDynamicArray<char*>* items=new CDynamicArray<char*>();
00276 
00277     while (*ptr_data)
00278     {
00279         if (*ptr_data=='\n')
00280         {
00281             if (ptr_item)
00282                 nf++;
00283 
00284             if (num_feat!=0 && nf!=num_feat)
00285                 SG_ERROR("Number of features mismatches (%d != %d) in vector %d in file %s.\n", num_feat, nf, num_vec, filename);
00286 
00287             append_item(items, ptr_data, ptr_item);
00288             num_feat=nf;
00289             num_vec++;
00290             nf=0;
00291             ptr_item=NULL;
00292         }
00293         else if (!isblank(*ptr_data) && !ptr_item)
00294         {
00295             ptr_item=ptr_data;
00296         }
00297         else if (isblank(*ptr_data) && ptr_item)
00298         {
00299             append_item(items, ptr_data, ptr_item);
00300             ptr_item=NULL;
00301             nf++;
00302         }
00303 
00304         ptr_data++;
00305     }
00306 
00307     SG_DEBUG("num feat: %d, num_vec %d\n", num_feat, num_vec);
00308     delete[] data;
00309 
00310     // now copy data into matrix
00311     matrix=new float64_t[num_vec*num_feat];
00312     for (int32_t i=0; i<num_vec; i++)
00313     {
00314         for (int32_t j=0; j<num_feat; j++)
00315         {
00316             char* item=items->get_element(i*num_feat+j);
00317             matrix[i*num_feat+j]=atof(item);
00318             delete[] item;
00319         }
00320     }
00321     delete items;
00322 
00323     //CMath::display_matrix(matrix, num_feat, num_vec);
00324     return true;
00325 }
00326 
00327 bool CFile::write_real_valued_dense(
00328     const float64_t* matrix, int32_t num_feat, int32_t num_vec)
00329 {
00330     if (!(file && matrix))
00331         SG_ERROR("File or matrix invalid.\n");
00332 
00333     for (int32_t i=0; i<num_feat; i++)
00334     {
00335         for (int32_t j=0; j<num_vec; j++)
00336         {
00337             float64_t v=matrix[num_feat*j+i];
00338             if (j==num_vec-1)
00339                 fprintf(file, "%f\n", v);
00340             else
00341                 fprintf(file, "%f ", v);
00342         }
00343     }
00344 
00345     return true;
00346 }
00347 
00348 bool CFile::read_real_valued_sparse(
00349     TSparse<float64_t>*& matrix, int32_t& num_feat, int32_t& num_vec)
00350 {
00351     size_t blocksize=1024*1024;
00352     size_t required_blocksize=blocksize;
00353     uint8_t* dummy=new uint8_t[blocksize];
00354 
00355     if (file)
00356     {
00357         num_vec=0;
00358         num_feat=0;
00359 
00360         SG_INFO("counting line numbers in file %s\n", filename);
00361         size_t sz=blocksize;
00362         size_t block_offs=0;
00363         size_t old_block_offs=0;
00364         fseek(file, 0, SEEK_END);
00365         size_t fsize=ftell(file);
00366         rewind(file);
00367 
00368         while (sz == blocksize)
00369         {
00370             sz=fread(dummy, sizeof(uint8_t), blocksize, file);
00371             bool contains_cr=false;
00372             for (size_t i=0; i<sz; i++)
00373             {
00374                 block_offs++;
00375                 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00376                 {
00377                     num_vec++;
00378                     contains_cr=true;
00379                     required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs+1);
00380                     old_block_offs=block_offs;
00381                 }
00382             }
00383             SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");
00384         }
00385 
00386         SG_INFO("found %d feature vectors\n", num_vec);
00387         delete[] dummy;
00388         blocksize=required_blocksize;
00389         dummy = new uint8_t[blocksize+1]; //allow setting of '\0' at EOL
00390         matrix=new TSparse<float64_t>[num_vec];
00391 
00392         rewind(file);
00393         sz=blocksize;
00394         int32_t lines=0;
00395         while (sz == blocksize)
00396         {
00397             sz=fread(dummy, sizeof(uint8_t), blocksize, file);
00398 
00399             size_t old_sz=0;
00400             for (size_t i=0; i<sz; i++)
00401             {
00402                 if (i==sz-1 && dummy[i]!='\n' && sz==blocksize)
00403                 {
00404                     size_t len=i-old_sz+1;
00405                     uint8_t* data=&dummy[old_sz];
00406 
00407                     for (size_t j=0; j<len; j++)
00408                         dummy[j]=data[j];
00409 
00410                     sz=fread(dummy+len, sizeof(uint8_t), blocksize-len, file);
00411                     i=0;
00412                     old_sz=0;
00413                     sz+=len;
00414                 }
00415 
00416                 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00417                 {
00418 
00419                     size_t len=i-old_sz;
00420                     uint8_t* data=&dummy[old_sz];
00421 
00422                     int32_t dims=0;
00423                     for (size_t j=0; j<len; j++)
00424                     {
00425                         if (data[j]==':')
00426                             dims++;
00427                     }
00428 
00429                     if (dims<=0)
00430                     {
00431                         SG_ERROR("Error in line %d - number of"
00432                                 " dimensions is %d line is %d characters"
00433                                 " long\n line_content:'%.*s'\n", lines,
00434                                 dims, len, len, (const char*) data);
00435                     }
00436 
00437                     TSparseEntry<float64_t>* feat=new TSparseEntry<float64_t>[dims];
00438 
00439                     //skip label part
00440                     size_t j=0;
00441                     for (; j<len; j++)
00442                     {
00443                         if (data[j]==':')
00444                         {
00445                             j=-1; //file without label
00446                             break;
00447                         }
00448 
00449                         if (data[j]==' ')
00450                         {
00451                             data[j]='\0';
00452 
00453                             //skip label part
00454                             break;
00455                         }
00456                     }
00457 
00458                     int32_t d=0;
00459                     j++;
00460                     uint8_t* start=&data[j];
00461                     for (; j<len; j++)
00462                     {
00463                         if (data[j]==':')
00464                         {
00465                             data[j]='\0';
00466 
00467                             feat[d].feat_index=(int32_t) atoi((const char*) start)-1;
00468                             num_feat=CMath::max(num_feat, feat[d].feat_index+1);
00469 
00470                             j++;
00471                             start=&data[j];
00472                             for (; j<len; j++)
00473                             {
00474                                 if (data[j]==' ' || data[j]=='\n')
00475                                 {
00476                                     data[j]='\0';
00477                                     feat[d].entry=(float64_t) atof((const char*) start);
00478                                     d++;
00479                                     break;
00480                                 }
00481                             }
00482 
00483                             if (j==len)
00484                             {
00485                                 data[j]='\0';
00486                                 feat[dims-1].entry=(float64_t) atof((const char*) start);
00487                             }
00488 
00489                             j++;
00490                             start=&data[j];
00491                         }
00492                     }
00493 
00494                     matrix[lines].vec_index=lines;
00495                     matrix[lines].num_feat_entries=dims;
00496                     matrix[lines].features=feat;
00497 
00498                     old_sz=i+1;
00499                     lines++;
00500                     SG_PROGRESS(lines, 0, num_vec, 1, "LOADING:\t");
00501                 }
00502             }
00503         }
00504 
00505         SG_INFO("file successfully read\n");
00506     }
00507 
00508     delete[] dummy;
00509     return true;
00510 }
00511 
00512 bool CFile::write_real_valued_sparse(
00513     const TSparse<float64_t>* matrix, int32_t num_feat, int32_t num_vec)
00514 {
00515     if (!(file && matrix))
00516         SG_ERROR("File or matrix invalid.\n");
00517 
00518     for (int32_t i=0; i<num_vec; i++)
00519     {
00520         TSparseEntry<float64_t>* vec = matrix[i].features;
00521         int32_t len=matrix[i].num_feat_entries;
00522 
00523         for (int32_t j=0; j<len; j++)
00524         {
00525             if (j<len-1)
00526                 fprintf(file, "%d:%f ", (int32_t) vec[j].feat_index+1, (double) vec[j].entry);
00527             else
00528                 fprintf(file, "%d:%f\n", (int32_t) vec[j].feat_index+1, (double) vec[j].entry);
00529         }
00530     }
00531 
00532     return true;
00533 }
00534 
00535 
00536 bool CFile::read_char_valued_strings(
00537     T_STRING<char>*& strings, int32_t& num_str, int32_t& max_string_len)
00538 {
00539     bool result=false;
00540 
00541     size_t blocksize=1024*1024;
00542     size_t required_blocksize=0;
00543     char* dummy=new char[blocksize];
00544     char* overflow=NULL;
00545     int32_t overflow_len=0;
00546 
00547     if (file)
00548     {
00549         num_str=0;
00550         max_string_len=0;
00551 
00552         SG_INFO("counting line numbers in file %s\n", filename);
00553         size_t sz=blocksize;
00554         size_t block_offs=0;
00555         size_t old_block_offs=0;
00556         fseek(file, 0, SEEK_END);
00557         size_t fsize=ftell(file);
00558         rewind(file);
00559 
00560         while (sz == blocksize)
00561         {
00562             sz=fread(dummy, sizeof(char), blocksize, file);
00563             bool contains_cr=false;
00564             for (size_t i=0; i<sz; i++)
00565             {
00566                 block_offs++;
00567                 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00568                 {
00569                     num_str++;
00570                     contains_cr=true;
00571                     required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs);
00572                     old_block_offs=block_offs;
00573                 }
00574             }
00575             SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");
00576         }
00577 
00578         SG_INFO("found %d strings\n", num_str);
00579         SG_DEBUG("block_size=%d\n", required_blocksize);
00580         delete[] dummy;
00581         blocksize=required_blocksize;
00582         dummy=new char[blocksize];
00583         overflow=new char[blocksize];
00584         strings=new T_STRING<char>[num_str];
00585 
00586         rewind(file);
00587         sz=blocksize;
00588         int32_t lines=0;
00589         size_t old_sz=0;
00590         while (sz == blocksize)
00591         {
00592             sz=fread(dummy, sizeof(char), blocksize, file);
00593 
00594             old_sz=0;
00595             for (size_t i=0; i<sz; i++)
00596             {
00597                 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00598                 {
00599                     int32_t len=i-old_sz;
00600                     max_string_len=CMath::max(max_string_len, len+overflow_len);
00601 
00602                     strings[lines].length=len+overflow_len;
00603                     strings[lines].string=new char[len+overflow_len];
00604 
00605                     for (int32_t j=0; j<overflow_len; j++)
00606                         strings[lines].string[j]=overflow[j];
00607                     for (int32_t j=0; j<len; j++)
00608                         strings[lines].string[j+overflow_len]=dummy[old_sz+j];
00609 
00610                     // clear overflow
00611                     overflow_len=0;
00612 
00613                     //CMath::display_vector(strings[lines].string, len);
00614                     old_sz=i+1;
00615                     lines++;
00616                     SG_PROGRESS(lines, 0, num_str, 1, "LOADING:\t");
00617                 }
00618             }
00619 
00620             for (size_t i=old_sz; i<sz; i++)
00621                 overflow[i-old_sz]=dummy[i];
00622 
00623             overflow_len=sz-old_sz;
00624         }
00625         result=true;
00626         SG_INFO("file successfully read\n");
00627         SG_INFO("max_string_length=%d\n", max_string_len);
00628         SG_INFO("num_strings=%d\n", num_str);
00629     }
00630 
00631     delete[] dummy;
00632     delete[] overflow;
00633 
00634     return result;
00635 }
00636 
00637 bool CFile::write_char_valued_strings(
00638     const T_STRING<char>* strings, int32_t num_str)
00639 {
00640     if (!(file && strings))
00641         SG_ERROR("File or strings invalid.\n");
00642 
00643     for (int32_t i=0; i<num_str; i++)
00644     {
00645         int32_t len = strings[i].length;
00646         fwrite(strings[i].string, sizeof(char), len, file);
00647         fprintf(file, "\n");
00648     }
00649 
00650     return true;
00651 }
00652 
00653 

SHOGUN Machine Learning Toolbox - Documentation