00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011 #include <string.h>
00012 #include <sys/types.h>
00013 #include <sys/stat.h>
00014 #include <unistd.h>
00015 #include <ctype.h>
00016
00017 #include "lib/File.h"
00018 #include "lib/SimpleFile.h"
00019
00020 #include "features/StringFeatures.h"
00021 #include "features/SparseFeatures.h"
00022
00023
00024 CFile::CFile(FILE* f)
00025 : CSGObject()
00026 {
00027 file=f;
00028 filename=NULL;
00029 expected_type=F_UNKNOWN;
00030 }
00031
00032 CFile::CFile(char* fname, char rw, EFeatureType typ, char file_fourcc[4])
00033 : CSGObject()
00034 {
00035 status=false;
00036 task=rw;
00037 expected_type=typ;
00038 filename=strdup(fname);
00039 char mode[2];
00040 mode[0]=rw;
00041 mode[1]='\0';
00042
00043
00044 if (rw=='r' || rw == 'w')
00045 {
00046 if (filename)
00047 {
00048 if ((file=fopen((const char*) filename, (const char*) mode)))
00049 status=true;
00050 }
00051 }
00052 else
00053 SG_ERROR("unknown mode '%c'\n", mode[0]);
00054
00055 if (file_fourcc)
00056 {
00057 if (rw=='r')
00058 status=read_header();
00059 else if (rw=='w')
00060 status=write_header();
00061
00062 if (!status)
00063 fclose(file);
00064
00065 file=NULL;
00066 }
00067 }
00068
00069 CFile::~CFile()
00070 {
00071 free(filename);
00072 if (file)
00073 fclose(file);
00074 filename=NULL;
00075 file=NULL;
00076 }
00077
00078 int32_t* CFile::load_int_data(int32_t* target, int64_t& num)
00079 {
00080 ASSERT(expected_type==F_INT);
00081 CSimpleFile<int32_t> f(filename, file);
00082 target=f.load(target, num);
00083 status=(target!=NULL);
00084 return target;
00085 }
00086
00087 bool CFile::save_int_data(int32_t* src, int64_t num)
00088 {
00089 ASSERT(expected_type==F_INT);
00090 CSimpleFile<int32_t> f(filename, file);
00091 status=f.save(src, num);
00092 return status;
00093 }
00094
00095 float64_t* CFile::load_real_data(float64_t* target, int64_t& num)
00096 {
00097 ASSERT(expected_type==F_DREAL);
00098 CSimpleFile<float64_t> f(filename, file);
00099 target=f.load(target, num);
00100 status=(target!=NULL);
00101 return target;
00102 }
00103
00104 float32_t* CFile::load_shortreal_data(float32_t* target, int64_t& num)
00105 {
00106 ASSERT(expected_type==F_SHORTREAL);
00107 CSimpleFile<float32_t> f(filename, file);
00108 target=f.load(target, num);
00109 status=(target!=NULL);
00110 return target;
00111 }
00112
00113 bool CFile::save_real_data(float64_t* src, int64_t num)
00114 {
00115 ASSERT(expected_type==F_DREAL);
00116 CSimpleFile<float64_t> f(filename, file);
00117 status=f.save(src, num);
00118 return status;
00119 }
00120
00121 bool CFile::save_shortreal_data(float32_t* src, int64_t num)
00122 {
00123 ASSERT(expected_type==F_SHORTREAL);
00124 CSimpleFile<float32_t> f(filename, file);
00125 status=f.save(src, num);
00126 return status;
00127 }
00128
00129 char* CFile::load_char_data(char* target, int64_t& num)
00130 {
00131 ASSERT(expected_type==F_CHAR);
00132 CSimpleFile<char> f(filename, file);
00133 target=f.load(target, num);
00134 status=(target!=NULL);
00135 return target;
00136 }
00137
00138 bool CFile::save_char_data(char* src, int64_t num)
00139 {
00140 ASSERT(expected_type==F_CHAR);
00141 CSimpleFile<char> f(filename, file);
00142 status=f.save(src, num);
00143 return status;
00144 }
00145
00146 uint8_t* CFile::load_byte_data(uint8_t* target, int64_t& num)
00147 {
00148 ASSERT(expected_type==F_BYTE);
00149 CSimpleFile<uint8_t> f(filename, file);
00150 target=f.load(target, num);
00151 status=(target!=NULL);
00152 return target;
00153 }
00154
00155 bool CFile::save_byte_data(uint8_t* src, int64_t num)
00156 {
00157 ASSERT(expected_type==F_BYTE);
00158 CSimpleFile<uint8_t> f(filename, file);
00159 status=f.save(src, num);
00160 return status;
00161 }
00162
00163 uint16_t* CFile::load_word_data(uint16_t* target, int64_t& num)
00164 {
00165 ASSERT(expected_type==F_WORD);
00166 CSimpleFile<uint16_t> f(filename, file);
00167 target=f.load(target, num);
00168 status=(target!=NULL);
00169 return target;
00170 }
00171
00172 bool CFile::save_word_data(uint16_t* src, int64_t num)
00173 {
00174 ASSERT(expected_type==F_WORD);
00175 CSimpleFile<uint16_t> f(filename, file);
00176 status=f.save(src, num);
00177 return status;
00178 }
00179
00180 int16_t* CFile::load_short_data(int16_t* target, int64_t& num)
00181 {
00182 ASSERT(expected_type==F_SHORT);
00183 CSimpleFile<int16_t> f(filename, file);
00184 target=f.load(target, num);
00185 status=(target!=NULL);
00186 return target;
00187 }
00188
00189 bool CFile::save_short_data(int16_t* src, int64_t num)
00190 {
00191 ASSERT(expected_type==F_SHORT);
00192 CSimpleFile<int16_t> f(filename, file);
00193 status=f.save(src, num);
00194 return status;
00195 }
00196
00197 int32_t CFile::parse_first_header(EFeatureType &type)
00198 {
00199 return -1;
00200 }
00201
00202 int32_t CFile::parse_next_header(EFeatureType &type)
00203 {
00204 return -1;
00205 }
00206
00207
00208 bool CFile::read_header()
00209 {
00210 ASSERT(file);
00211 uint32_t intlen=0;
00212 uint32_t endian=0;
00213 uint32_t file_fourcc=0;
00214 uint32_t doublelen=0;
00215
00216 if ( (fread(&intlen, sizeof(uint8_t), 1, file)==1) &&
00217 (fread(&doublelen, sizeof(uint8_t), 1, file)==1) &&
00218 (fread(&endian, (uint32_t) intlen, 1, file)== 1) &&
00219 (fread(&file_fourcc, (uint32_t) intlen, 1, file)==1))
00220 return true;
00221 else
00222 return false;
00223 }
00224
00225 bool CFile::write_header()
00226 {
00227 uint8_t intlen=sizeof(uint32_t);
00228 uint8_t doublelen=sizeof(double);
00229 uint32_t endian=0x12345678;
00230
00231 if ((fwrite(&intlen, sizeof(uint8_t), 1, file)==1) &&
00232 (fwrite(&doublelen, sizeof(uint8_t), 1, file)==1) &&
00233 (fwrite(&endian, sizeof(uint32_t), 1, file)==1) &&
00234 (fwrite(&fourcc, 4*sizeof(char), 1, file)==1))
00235 return true;
00236 else
00237 return false;
00238 }
00239
00240 template <class T> void CFile::append_item(
00241 CDynamicArray<T>* items, char* ptr_data, char* ptr_item)
00242 {
00243 size_t len=(ptr_data-ptr_item)/sizeof(char);
00244 char* item=new char[len+1];
00245 memset(item, 0, sizeof(char)*(len+1));
00246 item=strncpy(item, ptr_item, len);
00247
00248 SG_DEBUG("current %c, len %d, item %s\n", *ptr_data, len, item);
00249 items->append_element(item);
00250 }
00251
00252 bool CFile::read_real_valued_dense(
00253 float64_t*& matrix, int32_t& num_feat, int32_t& num_vec)
00254 {
00255 ASSERT(expected_type==F_DREAL);
00256
00257 struct stat stats;
00258 if (stat(filename, &stats)!=0)
00259 SG_ERROR("Could not get file statistics.\n");
00260
00261 char* data=new char[stats.st_size+1];
00262 memset(data, 0, sizeof(char)*(stats.st_size+1));
00263 size_t nread=fread(data, sizeof(char), stats.st_size, file);
00264 if (nread<=0)
00265 SG_ERROR("Could not read data from %s.\n");
00266
00267 SG_DEBUG("data read from file:\n%s\n", data);
00268
00269
00270 int32_t nf=0;
00271 num_feat=0;
00272 num_vec=0;
00273 char* ptr_item=NULL;
00274 char* ptr_data=data;
00275 CDynamicArray<char*>* items=new CDynamicArray<char*>();
00276
00277 while (*ptr_data)
00278 {
00279 if (*ptr_data=='\n')
00280 {
00281 if (ptr_item)
00282 nf++;
00283
00284 if (num_feat!=0 && nf!=num_feat)
00285 SG_ERROR("Number of features mismatches (%d != %d) in vector %d in file %s.\n", num_feat, nf, num_vec, filename);
00286
00287 append_item(items, ptr_data, ptr_item);
00288 num_feat=nf;
00289 num_vec++;
00290 nf=0;
00291 ptr_item=NULL;
00292 }
00293 else if (!isblank(*ptr_data) && !ptr_item)
00294 {
00295 ptr_item=ptr_data;
00296 }
00297 else if (isblank(*ptr_data) && ptr_item)
00298 {
00299 append_item(items, ptr_data, ptr_item);
00300 ptr_item=NULL;
00301 nf++;
00302 }
00303
00304 ptr_data++;
00305 }
00306
00307 SG_DEBUG("num feat: %d, num_vec %d\n", num_feat, num_vec);
00308 delete[] data;
00309
00310
00311 matrix=new float64_t[num_vec*num_feat];
00312 for (int32_t i=0; i<num_vec; i++)
00313 {
00314 for (int32_t j=0; j<num_feat; j++)
00315 {
00316 char* item=items->get_element(i*num_feat+j);
00317 matrix[i*num_feat+j]=atof(item);
00318 delete[] item;
00319 }
00320 }
00321 delete items;
00322
00323
00324 return true;
00325 }
00326
00327 bool CFile::write_real_valued_dense(
00328 const float64_t* matrix, int32_t num_feat, int32_t num_vec)
00329 {
00330 if (!(file && matrix))
00331 SG_ERROR("File or matrix invalid.\n");
00332
00333 for (int32_t i=0; i<num_feat; i++)
00334 {
00335 for (int32_t j=0; j<num_vec; j++)
00336 {
00337 float64_t v=matrix[num_feat*j+i];
00338 if (j==num_vec-1)
00339 fprintf(file, "%f\n", v);
00340 else
00341 fprintf(file, "%f ", v);
00342 }
00343 }
00344
00345 return true;
00346 }
00347
00348 bool CFile::read_real_valued_sparse(
00349 TSparse<float64_t>*& matrix, int32_t& num_feat, int32_t& num_vec)
00350 {
00351 size_t blocksize=1024*1024;
00352 size_t required_blocksize=blocksize;
00353 uint8_t* dummy=new uint8_t[blocksize];
00354
00355 if (file)
00356 {
00357 num_vec=0;
00358 num_feat=0;
00359
00360 SG_INFO("counting line numbers in file %s\n", filename);
00361 size_t sz=blocksize;
00362 size_t block_offs=0;
00363 size_t old_block_offs=0;
00364 fseek(file, 0, SEEK_END);
00365 size_t fsize=ftell(file);
00366 rewind(file);
00367
00368 while (sz == blocksize)
00369 {
00370 sz=fread(dummy, sizeof(uint8_t), blocksize, file);
00371 bool contains_cr=false;
00372 for (size_t i=0; i<sz; i++)
00373 {
00374 block_offs++;
00375 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00376 {
00377 num_vec++;
00378 contains_cr=true;
00379 required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs+1);
00380 old_block_offs=block_offs;
00381 }
00382 }
00383 SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");
00384 }
00385
00386 SG_INFO("found %d feature vectors\n", num_vec);
00387 delete[] dummy;
00388 blocksize=required_blocksize;
00389 dummy = new uint8_t[blocksize+1];
00390 matrix=new TSparse<float64_t>[num_vec];
00391
00392 rewind(file);
00393 sz=blocksize;
00394 int32_t lines=0;
00395 while (sz == blocksize)
00396 {
00397 sz=fread(dummy, sizeof(uint8_t), blocksize, file);
00398
00399 size_t old_sz=0;
00400 for (size_t i=0; i<sz; i++)
00401 {
00402 if (i==sz-1 && dummy[i]!='\n' && sz==blocksize)
00403 {
00404 size_t len=i-old_sz+1;
00405 uint8_t* data=&dummy[old_sz];
00406
00407 for (size_t j=0; j<len; j++)
00408 dummy[j]=data[j];
00409
00410 sz=fread(dummy+len, sizeof(uint8_t), blocksize-len, file);
00411 i=0;
00412 old_sz=0;
00413 sz+=len;
00414 }
00415
00416 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00417 {
00418
00419 size_t len=i-old_sz;
00420 uint8_t* data=&dummy[old_sz];
00421
00422 int32_t dims=0;
00423 for (size_t j=0; j<len; j++)
00424 {
00425 if (data[j]==':')
00426 dims++;
00427 }
00428
00429 if (dims<=0)
00430 {
00431 SG_ERROR("Error in line %d - number of"
00432 " dimensions is %d line is %d characters"
00433 " long\n line_content:'%.*s'\n", lines,
00434 dims, len, len, (const char*) data);
00435 }
00436
00437 TSparseEntry<float64_t>* feat=new TSparseEntry<float64_t>[dims];
00438
00439
00440 size_t j=0;
00441 for (; j<len; j++)
00442 {
00443 if (data[j]==':')
00444 {
00445 j=-1;
00446 break;
00447 }
00448
00449 if (data[j]==' ')
00450 {
00451 data[j]='\0';
00452
00453
00454 break;
00455 }
00456 }
00457
00458 int32_t d=0;
00459 j++;
00460 uint8_t* start=&data[j];
00461 for (; j<len; j++)
00462 {
00463 if (data[j]==':')
00464 {
00465 data[j]='\0';
00466
00467 feat[d].feat_index=(int32_t) atoi((const char*) start)-1;
00468 num_feat=CMath::max(num_feat, feat[d].feat_index+1);
00469
00470 j++;
00471 start=&data[j];
00472 for (; j<len; j++)
00473 {
00474 if (data[j]==' ' || data[j]=='\n')
00475 {
00476 data[j]='\0';
00477 feat[d].entry=(float64_t) atof((const char*) start);
00478 d++;
00479 break;
00480 }
00481 }
00482
00483 if (j==len)
00484 {
00485 data[j]='\0';
00486 feat[dims-1].entry=(float64_t) atof((const char*) start);
00487 }
00488
00489 j++;
00490 start=&data[j];
00491 }
00492 }
00493
00494 matrix[lines].vec_index=lines;
00495 matrix[lines].num_feat_entries=dims;
00496 matrix[lines].features=feat;
00497
00498 old_sz=i+1;
00499 lines++;
00500 SG_PROGRESS(lines, 0, num_vec, 1, "LOADING:\t");
00501 }
00502 }
00503 }
00504
00505 SG_INFO("file successfully read\n");
00506 }
00507
00508 delete[] dummy;
00509 return true;
00510 }
00511
00512 bool CFile::write_real_valued_sparse(
00513 const TSparse<float64_t>* matrix, int32_t num_feat, int32_t num_vec)
00514 {
00515 if (!(file && matrix))
00516 SG_ERROR("File or matrix invalid.\n");
00517
00518 for (int32_t i=0; i<num_vec; i++)
00519 {
00520 TSparseEntry<float64_t>* vec = matrix[i].features;
00521 int32_t len=matrix[i].num_feat_entries;
00522
00523 for (int32_t j=0; j<len; j++)
00524 {
00525 if (j<len-1)
00526 fprintf(file, "%d:%f ", (int32_t) vec[j].feat_index+1, (double) vec[j].entry);
00527 else
00528 fprintf(file, "%d:%f\n", (int32_t) vec[j].feat_index+1, (double) vec[j].entry);
00529 }
00530 }
00531
00532 return true;
00533 }
00534
00535
00536 bool CFile::read_char_valued_strings(
00537 T_STRING<char>*& strings, int32_t& num_str, int32_t& max_string_len)
00538 {
00539 bool result=false;
00540
00541 size_t blocksize=1024*1024;
00542 size_t required_blocksize=0;
00543 char* dummy=new char[blocksize];
00544 char* overflow=NULL;
00545 int32_t overflow_len=0;
00546
00547 if (file)
00548 {
00549 num_str=0;
00550 max_string_len=0;
00551
00552 SG_INFO("counting line numbers in file %s\n", filename);
00553 size_t sz=blocksize;
00554 size_t block_offs=0;
00555 size_t old_block_offs=0;
00556 fseek(file, 0, SEEK_END);
00557 size_t fsize=ftell(file);
00558 rewind(file);
00559
00560 while (sz == blocksize)
00561 {
00562 sz=fread(dummy, sizeof(char), blocksize, file);
00563 bool contains_cr=false;
00564 for (size_t i=0; i<sz; i++)
00565 {
00566 block_offs++;
00567 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00568 {
00569 num_str++;
00570 contains_cr=true;
00571 required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs);
00572 old_block_offs=block_offs;
00573 }
00574 }
00575 SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");
00576 }
00577
00578 SG_INFO("found %d strings\n", num_str);
00579 SG_DEBUG("block_size=%d\n", required_blocksize);
00580 delete[] dummy;
00581 blocksize=required_blocksize;
00582 dummy=new char[blocksize];
00583 overflow=new char[blocksize];
00584 strings=new T_STRING<char>[num_str];
00585
00586 rewind(file);
00587 sz=blocksize;
00588 int32_t lines=0;
00589 size_t old_sz=0;
00590 while (sz == blocksize)
00591 {
00592 sz=fread(dummy, sizeof(char), blocksize, file);
00593
00594 old_sz=0;
00595 for (size_t i=0; i<sz; i++)
00596 {
00597 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00598 {
00599 int32_t len=i-old_sz;
00600 max_string_len=CMath::max(max_string_len, len+overflow_len);
00601
00602 strings[lines].length=len+overflow_len;
00603 strings[lines].string=new char[len+overflow_len];
00604
00605 for (int32_t j=0; j<overflow_len; j++)
00606 strings[lines].string[j]=overflow[j];
00607 for (int32_t j=0; j<len; j++)
00608 strings[lines].string[j+overflow_len]=dummy[old_sz+j];
00609
00610
00611 overflow_len=0;
00612
00613
00614 old_sz=i+1;
00615 lines++;
00616 SG_PROGRESS(lines, 0, num_str, 1, "LOADING:\t");
00617 }
00618 }
00619
00620 for (size_t i=old_sz; i<sz; i++)
00621 overflow[i-old_sz]=dummy[i];
00622
00623 overflow_len=sz-old_sz;
00624 }
00625 result=true;
00626 SG_INFO("file successfully read\n");
00627 SG_INFO("max_string_length=%d\n", max_string_len);
00628 SG_INFO("num_strings=%d\n", num_str);
00629 }
00630
00631 delete[] dummy;
00632 delete[] overflow;
00633
00634 return result;
00635 }
00636
00637 bool CFile::write_char_valued_strings(
00638 const T_STRING<char>* strings, int32_t num_str)
00639 {
00640 if (!(file && strings))
00641 SG_ERROR("File or strings invalid.\n");
00642
00643 for (int32_t i=0; i<num_str; i++)
00644 {
00645 int32_t len = strings[i].length;
00646 fwrite(strings[i].string, sizeof(char), len, file);
00647 fprintf(file, "\n");
00648 }
00649
00650 return true;
00651 }
00652
00653