BinaryFile.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2010 Soeren Sonnenburg
00008  * Copyright (C) 2010 Berlin Institute of Technology
00009  */
00010 
00011 #include "lib/File.h"
00012 #include "features/SparseFeatures.h"
00013 #include "lib/BinaryFile.h"
00014 
00015 using namespace shogun;
00016 
00017 CBinaryFile::CBinaryFile(FILE* f, const char* name) : CFile(f, name)
00018 {
00019 }
00020 
00021 CBinaryFile::CBinaryFile(char* fname, char rw, const char* name) : CFile(fname, rw, name)
00022 {
00023 }
00024 
00025 CBinaryFile::~CBinaryFile()
00026 {
00027 }
00028 
00029 #define GET_VECTOR(fname, sg_type, datatype)                                        \
00030 void CBinaryFile::fname(sg_type*& vec, int32_t& len)                                \
00031 {                                                                                   \
00032     if (!file)                                                                      \
00033         SG_ERROR("File invalid.\n");                                                \
00034     SGDataType dtype=read_header();                                                 \
00035     if (dtype!=datatype)                                                            \
00036         SG_ERROR("Datatype mismatch\n");                                            \
00037                                                                                     \
00038     if (fread(&len, sizeof(int32_t), 1, file)!=1)                                   \
00039         SG_ERROR("Failed to read vector length\n");                                 \
00040     vec=new sg_type[len];                                                           \
00041     if (fread(vec, sizeof(sg_type), len, file)!=(size_t) len)                       \
00042         SG_ERROR("Failed to read Matrix\n");                                        \
00043 }
00044 
00045 GET_VECTOR(get_byte_vector, uint8_t, DT_VECTOR_BYTE)
00046 GET_VECTOR(get_char_vector, char, DT_VECTOR_CHAR)
00047 GET_VECTOR(get_int_vector, int32_t, DT_VECTOR_INT)
00048 GET_VECTOR(get_shortreal_vector, float32_t, DT_VECTOR_SHORTREAL)
00049 GET_VECTOR(get_real_vector, float64_t, DT_VECTOR_REAL)
00050 GET_VECTOR(get_short_vector, int16_t, DT_VECTOR_SHORT)
00051 GET_VECTOR(get_word_vector, uint16_t, DT_VECTOR_WORD)
00052 #undef GET_VECTOR
00053 
00054 #define GET_MATRIX(fname, sg_type, datatype)                                        \
00055 void CBinaryFile::fname(sg_type*& matrix, int32_t& num_feat, int32_t& num_vec)      \
00056 {                                                                                   \
00057     if (!file)                                                                      \
00058         SG_ERROR("File invalid.\n");                                                \
00059     SGDataType dtype=read_header();                                                 \
00060     if (dtype!=datatype)                                                            \
00061         SG_ERROR("Datatype mismatch\n");                                            \
00062                                                                                     \
00063     if (fread(&num_feat, sizeof(int32_t), 1, file)!=1 ||                            \
00064             fread(&num_vec, sizeof(int32_t), 1, file)!=1)                           \
00065         SG_ERROR("Failed to read Matrix dimensions\n");                             \
00066     matrix=new sg_type[int64_t(num_feat)*num_vec];                                  \
00067     if (fread(matrix, sizeof(sg_type)*num_feat, num_vec, file)!=(size_t) num_vec)   \
00068         SG_ERROR("Failed to read Matrix\n");                                        \
00069 }
00070 
00071 GET_MATRIX(get_char_matrix, char, DT_DENSE_CHAR)
00072 GET_MATRIX(get_byte_matrix, uint8_t, DT_DENSE_BYTE)
00073 GET_MATRIX(get_int_matrix, int32_t, DT_DENSE_INT)
00074 GET_MATRIX(get_uint_matrix, uint32_t, DT_DENSE_UINT)
00075 GET_MATRIX(get_long_matrix, int64_t, DT_DENSE_LONG)
00076 GET_MATRIX(get_ulong_matrix, uint64_t, DT_DENSE_ULONG)
00077 GET_MATRIX(get_short_matrix, int16_t, DT_DENSE_SHORT)
00078 GET_MATRIX(get_word_matrix, uint16_t, DT_DENSE_WORD)
00079 GET_MATRIX(get_shortreal_matrix, float32_t, DT_DENSE_SHORTREAL)
00080 GET_MATRIX(get_real_matrix, float64_t, DT_DENSE_REAL)
00081 GET_MATRIX(get_longreal_matrix, floatmax_t, DT_DENSE_LONGREAL)
00082 #undef GET_MATRIX
00083 
00084 void CBinaryFile::get_byte_ndarray(uint8_t*& array, int32_t*& dims, int32_t& num_dims)
00085 {
00086 }
00087 
00088 void CBinaryFile::get_char_ndarray(char*& array, int32_t*& dims, int32_t& num_dims)
00089 {
00090 }
00091 
00092 void CBinaryFile::get_int_ndarray(int32_t*& array, int32_t*& dims, int32_t& num_dims)
00093 {
00094 }
00095 
00096 void CBinaryFile::get_shortreal_ndarray(float32_t*& array, int32_t*& dims, int32_t& num_dims)
00097 {
00098 }
00099 
00100 void CBinaryFile::get_real_ndarray(float64_t*& array, int32_t*& dims, int32_t& num_dims)
00101 {
00102 }
00103 
00104 void CBinaryFile::get_short_ndarray(int16_t*& array, int32_t*& dims, int32_t& num_dims)
00105 {
00106 }
00107 
00108 void CBinaryFile::get_word_ndarray(uint16_t*& array, int32_t*& dims, int32_t& num_dims)
00109 {
00110 }
00111 
00112 #define GET_SPARSEMATRIX(fname, sg_type, datatype)                                      \
00113 void CBinaryFile::fname(TSparse<sg_type>*& matrix, int32_t& num_feat, int32_t& num_vec) \
00114 {                                                                                       \
00115     if (!(file))                                                                        \
00116         SG_ERROR("File invalid.\n");                                                    \
00117                                                                                         \
00118     SGDataType dtype=read_header();                                                     \
00119     if (dtype!=datatype)                                                                \
00120         SG_ERROR("Datatype mismatch\n");                                                \
00121                                                                                         \
00122     if (fread(&num_vec, sizeof(int32_t), 1, file)!=1)                                   \
00123         SG_ERROR("Failed to read number of vectors\n");                                 \
00124                                                                                         \
00125     matrix=new TSparse<sg_type>[num_vec];                                               \
00126                                                                                         \
00127     for (int32_t i=0; i<num_vec; i++)                                                   \
00128     {                                                                                   \
00129         int32_t len=0;                                                                  \
00130         if (fread(&len, sizeof(int32_t), 1, file)!=1)                                   \
00131             SG_ERROR("Failed to read sparse vector length of vector idx=%d\n", i);      \
00132         matrix[i].num_feat_entries=len;                                                 \
00133         TSparseEntry<sg_type>* vec = new TSparseEntry<sg_type>[len];                    \
00134         if (fread(vec, sizeof(TSparseEntry<sg_type>), len, file)!= (size_t) len)        \
00135             SG_ERROR("Failed to read sparse vector %d\n", i);                           \
00136         matrix[i].features=vec;                                                         \
00137     }                                                                                   \
00138 }
00139 GET_SPARSEMATRIX(get_bool_sparsematrix, bool, DT_SPARSE_BOOL)
00140 GET_SPARSEMATRIX(get_char_sparsematrix, char, DT_SPARSE_CHAR)
00141 GET_SPARSEMATRIX(get_byte_sparsematrix, uint8_t, DT_SPARSE_BYTE)
00142 GET_SPARSEMATRIX(get_int_sparsematrix, int32_t, DT_SPARSE_INT)
00143 GET_SPARSEMATRIX(get_uint_sparsematrix, uint32_t, DT_SPARSE_UINT)
00144 GET_SPARSEMATRIX(get_long_sparsematrix, int64_t, DT_SPARSE_LONG)
00145 GET_SPARSEMATRIX(get_ulong_sparsematrix, uint64_t, DT_SPARSE_ULONG)
00146 GET_SPARSEMATRIX(get_short_sparsematrix, int16_t, DT_SPARSE_SHORT)
00147 GET_SPARSEMATRIX(get_word_sparsematrix, uint16_t, DT_SPARSE_WORD)
00148 GET_SPARSEMATRIX(get_shortreal_sparsematrix, float32_t, DT_SPARSE_SHORTREAL)
00149 GET_SPARSEMATRIX(get_real_sparsematrix, float64_t, DT_SPARSE_REAL)
00150 GET_SPARSEMATRIX(get_longreal_sparsematrix, floatmax_t, DT_SPARSE_LONGREAL)
00151 #undef GET_SPARSEMATRIX
00152 
00153 
00154 #define GET_STRING_LIST(fname, sg_type, datatype)                                               \
00155 void CBinaryFile::fname(T_STRING<sg_type>*& strings, int32_t& num_str, int32_t& max_string_len) \
00156 {                                                                                               \
00157     strings=NULL;                                                                               \
00158     num_str=0;                                                                                  \
00159     max_string_len=0;                                                                           \
00160                                                                                                 \
00161     if (!file)                                                                                  \
00162         SG_ERROR("File invalid.\n");                                                            \
00163                                                                                                 \
00164     SGDataType dtype=read_header();                                                             \
00165     if (dtype!=datatype)                                                                        \
00166         SG_ERROR("Datatype mismatch\n");                                                        \
00167                                                                                                 \
00168     if (fread(&num_str, sizeof(int32_t), 1, file)!=1)                                           \
00169         SG_ERROR("Failed to read number of strings\n");                                         \
00170                                                                                                 \
00171     strings=new T_STRING<sg_type>[num_str];                                                     \
00172                                                                                                 \
00173     for (int32_t i=0; i<num_str; i++)                                                           \
00174     {                                                                                           \
00175         int32_t len=0;                                                                          \
00176         if (fread(&len, sizeof(int32_t), 1, file)!=1)                                           \
00177             SG_ERROR("Failed to read string length of string with idx=%d\n", i);                \
00178         strings[i].length=len;                                                                  \
00179         sg_type* str = new sg_type[len];                                                        \
00180         if (fread(str, sizeof(sg_type), len, file)!= (size_t) len)                              \
00181             SG_ERROR("Failed to read string %d\n", i);                                          \
00182         strings[i].string=str;                                                                  \
00183     }                                                                                           \
00184 }
00185 
00186 GET_STRING_LIST(get_char_string_list, char, DT_STRING_CHAR)
00187 GET_STRING_LIST(get_byte_string_list, uint8_t, DT_STRING_BYTE)
00188 GET_STRING_LIST(get_int_string_list, int32_t, DT_STRING_INT)
00189 GET_STRING_LIST(get_uint_string_list, uint32_t, DT_STRING_UINT)
00190 GET_STRING_LIST(get_long_string_list, int64_t, DT_STRING_LONG)
00191 GET_STRING_LIST(get_ulong_string_list, uint64_t, DT_STRING_ULONG)
00192 GET_STRING_LIST(get_short_string_list, int16_t, DT_STRING_SHORT)
00193 GET_STRING_LIST(get_word_string_list, uint16_t, DT_STRING_WORD)
00194 GET_STRING_LIST(get_shortreal_string_list, float32_t, DT_STRING_SHORTREAL)
00195 GET_STRING_LIST(get_real_string_list, float64_t, DT_STRING_REAL)
00196 GET_STRING_LIST(get_longreal_string_list, floatmax_t, DT_STRING_LONGREAL)
00197 #undef GET_STRING_LIST
00198 
00201 #define SET_VECTOR(fname, sg_type, dtype)                           \
00202 void CBinaryFile::fname(const sg_type* vec, int32_t len)            \
00203 {                                                                   \
00204     if (!(file && vec))                                             \
00205         SG_ERROR("File or vector invalid.\n");                      \
00206                                                                     \
00207     write_header(dtype);                                            \
00208                                                                     \
00209     if (fwrite(&len, sizeof(int32_t), 1, file)!=1 ||                \
00210             fwrite(vec, sizeof(sg_type), len, file)!=(size_t) len)  \
00211         SG_ERROR("Failed to write vector\n");                       \
00212 }
00213 SET_VECTOR(set_byte_vector, uint8_t, DT_VECTOR_BYTE)
00214 SET_VECTOR(set_char_vector, char, DT_VECTOR_CHAR)
00215 SET_VECTOR(set_int_vector, int32_t, DT_VECTOR_INT)
00216 SET_VECTOR(set_shortreal_vector, float32_t, DT_VECTOR_SHORTREAL)
00217 SET_VECTOR(set_real_vector, float64_t, DT_VECTOR_REAL)
00218 SET_VECTOR(set_short_vector, int16_t, DT_VECTOR_SHORT)
00219 SET_VECTOR(set_word_vector, uint16_t, DT_VECTOR_WORD)
00220 #undef SET_VECTOR
00221 
00222 #define SET_MATRIX(fname, sg_type, dtype) \
00223 void CBinaryFile::fname(const sg_type* matrix, int32_t num_feat, int32_t num_vec)   \
00224 {                                                                                   \
00225     if (!(file && matrix))                                                          \
00226         SG_ERROR("File or matrix invalid.\n");                                      \
00227                                                                                     \
00228     write_header(dtype);                                                            \
00229                                                                                     \
00230     if (fwrite(&num_feat, sizeof(int32_t), 1, file)!=1 ||                           \
00231             fwrite(&num_vec, sizeof(int32_t), 1, file)!=1 ||                        \
00232             fwrite(matrix, sizeof(sg_type)*num_feat, num_vec, file)!=(size_t) num_vec)  \
00233         SG_ERROR("Failed to write Matrix\n");                                       \
00234 }
00235 SET_MATRIX(set_char_matrix, char, DT_DENSE_CHAR)
00236 SET_MATRIX(set_byte_matrix, uint8_t, DT_DENSE_BYTE)
00237 SET_MATRIX(set_int_matrix, int32_t, DT_DENSE_INT)
00238 SET_MATRIX(set_uint_matrix, uint32_t, DT_DENSE_UINT)
00239 SET_MATRIX(set_long_matrix, int64_t, DT_DENSE_LONG)
00240 SET_MATRIX(set_ulong_matrix, uint64_t, DT_DENSE_ULONG)
00241 SET_MATRIX(set_short_matrix, int16_t, DT_DENSE_SHORT)
00242 SET_MATRIX(set_word_matrix, uint16_t, DT_DENSE_WORD)
00243 SET_MATRIX(set_shortreal_matrix, float32_t, DT_DENSE_SHORTREAL)
00244 SET_MATRIX(set_real_matrix, float64_t, DT_DENSE_REAL)
00245 SET_MATRIX(set_longreal_matrix, floatmax_t, DT_DENSE_LONGREAL)
00246 #undef SET_MATRIX
00247 
00248 #define SET_SPARSEMATRIX(fname, sg_type, dtype)             \
00249 void CBinaryFile::fname(const TSparse<sg_type>* matrix,     \
00250         int32_t num_feat, int32_t num_vec)                  \
00251 {                                                           \
00252     if (!(file && matrix))                                  \
00253         SG_ERROR("File or matrix invalid.\n");              \
00254                                                             \
00255     write_header(dtype);                                    \
00256                                                             \
00257     if (fwrite(&num_vec, sizeof(int32_t), 1, file)!=1)      \
00258         SG_ERROR("Failed to write Sparse Matrix\n");        \
00259                                                             \
00260     for (int32_t i=0; i<num_vec; i++)                       \
00261     {                                                       \
00262         TSparseEntry<sg_type>* vec = matrix[i].features;    \
00263         int32_t len=matrix[i].num_feat_entries;             \
00264         if ((fwrite(&len, sizeof(int32_t), 1, file)!=1) ||  \
00265                 (fwrite(vec, sizeof(TSparseEntry<sg_type>), len, file)!= (size_t) len))     \
00266             SG_ERROR("Failed to write Sparse Matrix\n");    \
00267     }                                                       \
00268 }
00269 SET_SPARSEMATRIX(set_bool_sparsematrix, bool, DT_SPARSE_BOOL)
00270 SET_SPARSEMATRIX(set_char_sparsematrix, char, DT_SPARSE_CHAR)
00271 SET_SPARSEMATRIX(set_byte_sparsematrix, uint8_t, DT_SPARSE_BYTE)
00272 SET_SPARSEMATRIX(set_int_sparsematrix, int32_t, DT_SPARSE_INT)
00273 SET_SPARSEMATRIX(set_uint_sparsematrix, uint32_t, DT_SPARSE_UINT)
00274 SET_SPARSEMATRIX(set_long_sparsematrix, int64_t, DT_SPARSE_LONG)
00275 SET_SPARSEMATRIX(set_ulong_sparsematrix, uint64_t, DT_SPARSE_ULONG)
00276 SET_SPARSEMATRIX(set_short_sparsematrix, int16_t, DT_SPARSE_SHORT)
00277 SET_SPARSEMATRIX(set_word_sparsematrix, uint16_t, DT_SPARSE_WORD)
00278 SET_SPARSEMATRIX(set_shortreal_sparsematrix, float32_t, DT_SPARSE_SHORTREAL)
00279 SET_SPARSEMATRIX(set_real_sparsematrix, float64_t, DT_SPARSE_REAL)
00280 SET_SPARSEMATRIX(set_longreal_sparsematrix, floatmax_t, DT_SPARSE_LONGREAL)
00281 #undef SET_SPARSEMATRIX
00282 
00283 #define SET_STRING_LIST(fname, sg_type, dtype) \
00284 void CBinaryFile::fname(const T_STRING<sg_type>* strings, int32_t num_str)  \
00285 {                                                                                       \
00286     if (!(file && strings))                                                             \
00287         SG_ERROR("File or strings invalid.\n");                                         \
00288                                                                                         \
00289     write_header(dtype);                                                                \
00290     for (int32_t i=0; i<num_str; i++)                                                   \
00291     {                                                                                   \
00292         int32_t len = strings[i].length;                                                \
00293         if ((fwrite(&len, sizeof(int32_t), 1, file)!=1) ||                              \
00294                 (fwrite(strings[i].string, sizeof(sg_type), len, file)!= (size_t) len)) \
00295             SG_ERROR("Failed to write Sparse Matrix\n");                                \
00296     }                                                                                   \
00297 }
00298 SET_STRING_LIST(set_char_string_list, char, DT_STRING_CHAR)
00299 SET_STRING_LIST(set_byte_string_list, uint8_t, DT_STRING_BYTE)
00300 SET_STRING_LIST(set_int_string_list, int32_t, DT_STRING_INT)
00301 SET_STRING_LIST(set_uint_string_list, uint32_t, DT_STRING_UINT)
00302 SET_STRING_LIST(set_long_string_list, int64_t, DT_STRING_LONG)
00303 SET_STRING_LIST(set_ulong_string_list, uint64_t, DT_STRING_ULONG)
00304 SET_STRING_LIST(set_short_string_list, int16_t, DT_STRING_SHORT)
00305 SET_STRING_LIST(set_word_string_list, uint16_t, DT_STRING_WORD)
00306 SET_STRING_LIST(set_shortreal_string_list, float32_t, DT_STRING_SHORTREAL)
00307 SET_STRING_LIST(set_real_string_list, float64_t, DT_STRING_REAL)
00308 SET_STRING_LIST(set_longreal_string_list, floatmax_t, DT_STRING_LONGREAL)
00309 #undef SET_STRING_LIST
00310 
00311 
00312 int32_t CBinaryFile::parse_first_header(SGDataType &type)
00313 {
00314         return -1;
00315 }
00316 
00317 int32_t CBinaryFile::parse_next_header(SGDataType &type)
00318 {
00319         return -1;
00320 }
00321 
00322 
00323 SGDataType CBinaryFile::read_header()
00324 {
00325     ASSERT(file);
00326 
00327     char fourcc[4];
00328     uint16_t endian=0;
00329     uint16_t dtype=0;
00330 
00331     if (!((fread(&fourcc, sizeof(char), 4, file)==4) &&
00332             (fread(&endian, sizeof(uint16_t), 1, file)== 1) &&
00333             (fread(&dtype, sizeof(uint16_t), 1, file)== 1)))
00334         SG_ERROR("Error reading header\n");
00335 
00336     if (strncmp(fourcc, "SG00", 4))
00337         SG_ERROR("Header mismatch, expected SG00\n");
00338 
00339     return (SGDataType) dtype;
00340 }
00341 
00342 void CBinaryFile::write_header(SGDataType datatype)
00343 {
00344     ASSERT(file);
00345 
00346     const char* fourcc="SG00";
00347     uint16_t endian=0x1234;
00348     uint16_t dtype=datatype;
00349 
00350     if (!((fwrite(fourcc, sizeof(char), 4, file)==4) &&
00351             (fwrite(&endian, sizeof(uint16_t), 1, file)==1) &&
00352             (fwrite(&dtype, sizeof(uint16_t), 1, file)==1)))
00353         SG_ERROR("Error writing header\n");
00354 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation