00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011 #include "lib/File.h"
00012 #include "features/SparseFeatures.h"
00013 #include "lib/BinaryFile.h"
00014
00015 using namespace shogun;
00016
00017 CBinaryFile::CBinaryFile(FILE* f, const char* name) : CFile(f, name)
00018 {
00019 }
00020
00021 CBinaryFile::CBinaryFile(char* fname, char rw, const char* name) : CFile(fname, rw, name)
00022 {
00023 }
00024
00025 CBinaryFile::~CBinaryFile()
00026 {
00027 }
00028
00029 #define GET_VECTOR(fname, sg_type, datatype) \
00030 void CBinaryFile::fname(sg_type*& vec, int32_t& len) \
00031 { \
00032 if (!file) \
00033 SG_ERROR("File invalid.\n"); \
00034 SGDataType dtype=read_header(); \
00035 if (dtype!=datatype) \
00036 SG_ERROR("Datatype mismatch\n"); \
00037 \
00038 if (fread(&len, sizeof(int32_t), 1, file)!=1) \
00039 SG_ERROR("Failed to read vector length\n"); \
00040 vec=new sg_type[len]; \
00041 if (fread(vec, sizeof(sg_type), len, file)!=(size_t) len) \
00042 SG_ERROR("Failed to read Matrix\n"); \
00043 }
00044
00045 GET_VECTOR(get_byte_vector, uint8_t, DT_VECTOR_BYTE)
00046 GET_VECTOR(get_char_vector, char, DT_VECTOR_CHAR)
00047 GET_VECTOR(get_int_vector, int32_t, DT_VECTOR_INT)
00048 GET_VECTOR(get_shortreal_vector, float32_t, DT_VECTOR_SHORTREAL)
00049 GET_VECTOR(get_real_vector, float64_t, DT_VECTOR_REAL)
00050 GET_VECTOR(get_short_vector, int16_t, DT_VECTOR_SHORT)
00051 GET_VECTOR(get_word_vector, uint16_t, DT_VECTOR_WORD)
00052 #undef GET_VECTOR
00053
00054 #define GET_MATRIX(fname, sg_type, datatype) \
00055 void CBinaryFile::fname(sg_type*& matrix, int32_t& num_feat, int32_t& num_vec) \
00056 { \
00057 if (!file) \
00058 SG_ERROR("File invalid.\n"); \
00059 SGDataType dtype=read_header(); \
00060 if (dtype!=datatype) \
00061 SG_ERROR("Datatype mismatch\n"); \
00062 \
00063 if (fread(&num_feat, sizeof(int32_t), 1, file)!=1 || \
00064 fread(&num_vec, sizeof(int32_t), 1, file)!=1) \
00065 SG_ERROR("Failed to read Matrix dimensions\n"); \
00066 matrix=new sg_type[int64_t(num_feat)*num_vec]; \
00067 if (fread(matrix, sizeof(sg_type)*num_feat, num_vec, file)!=(size_t) num_vec) \
00068 SG_ERROR("Failed to read Matrix\n"); \
00069 }
00070
00071 GET_MATRIX(get_char_matrix, char, DT_DENSE_CHAR)
00072 GET_MATRIX(get_byte_matrix, uint8_t, DT_DENSE_BYTE)
00073 GET_MATRIX(get_int_matrix, int32_t, DT_DENSE_INT)
00074 GET_MATRIX(get_uint_matrix, uint32_t, DT_DENSE_UINT)
00075 GET_MATRIX(get_long_matrix, int64_t, DT_DENSE_LONG)
00076 GET_MATRIX(get_ulong_matrix, uint64_t, DT_DENSE_ULONG)
00077 GET_MATRIX(get_short_matrix, int16_t, DT_DENSE_SHORT)
00078 GET_MATRIX(get_word_matrix, uint16_t, DT_DENSE_WORD)
00079 GET_MATRIX(get_shortreal_matrix, float32_t, DT_DENSE_SHORTREAL)
00080 GET_MATRIX(get_real_matrix, float64_t, DT_DENSE_REAL)
00081 GET_MATRIX(get_longreal_matrix, floatmax_t, DT_DENSE_LONGREAL)
00082 #undef GET_MATRIX
00083
00084 void CBinaryFile::get_byte_ndarray(uint8_t*& array, int32_t*& dims, int32_t& num_dims)
00085 {
00086 }
00087
00088 void CBinaryFile::get_char_ndarray(char*& array, int32_t*& dims, int32_t& num_dims)
00089 {
00090 }
00091
00092 void CBinaryFile::get_int_ndarray(int32_t*& array, int32_t*& dims, int32_t& num_dims)
00093 {
00094 }
00095
00096 void CBinaryFile::get_shortreal_ndarray(float32_t*& array, int32_t*& dims, int32_t& num_dims)
00097 {
00098 }
00099
00100 void CBinaryFile::get_real_ndarray(float64_t*& array, int32_t*& dims, int32_t& num_dims)
00101 {
00102 }
00103
00104 void CBinaryFile::get_short_ndarray(int16_t*& array, int32_t*& dims, int32_t& num_dims)
00105 {
00106 }
00107
00108 void CBinaryFile::get_word_ndarray(uint16_t*& array, int32_t*& dims, int32_t& num_dims)
00109 {
00110 }
00111
00112 #define GET_SPARSEMATRIX(fname, sg_type, datatype) \
00113 void CBinaryFile::fname(TSparse<sg_type>*& matrix, int32_t& num_feat, int32_t& num_vec) \
00114 { \
00115 if (!(file)) \
00116 SG_ERROR("File invalid.\n"); \
00117 \
00118 SGDataType dtype=read_header(); \
00119 if (dtype!=datatype) \
00120 SG_ERROR("Datatype mismatch\n"); \
00121 \
00122 if (fread(&num_vec, sizeof(int32_t), 1, file)!=1) \
00123 SG_ERROR("Failed to read number of vectors\n"); \
00124 \
00125 matrix=new TSparse<sg_type>[num_vec]; \
00126 \
00127 for (int32_t i=0; i<num_vec; i++) \
00128 { \
00129 int32_t len=0; \
00130 if (fread(&len, sizeof(int32_t), 1, file)!=1) \
00131 SG_ERROR("Failed to read sparse vector length of vector idx=%d\n", i); \
00132 matrix[i].num_feat_entries=len; \
00133 TSparseEntry<sg_type>* vec = new TSparseEntry<sg_type>[len]; \
00134 if (fread(vec, sizeof(TSparseEntry<sg_type>), len, file)!= (size_t) len) \
00135 SG_ERROR("Failed to read sparse vector %d\n", i); \
00136 matrix[i].features=vec; \
00137 } \
00138 }
00139 GET_SPARSEMATRIX(get_bool_sparsematrix, bool, DT_SPARSE_BOOL)
00140 GET_SPARSEMATRIX(get_char_sparsematrix, char, DT_SPARSE_CHAR)
00141 GET_SPARSEMATRIX(get_byte_sparsematrix, uint8_t, DT_SPARSE_BYTE)
00142 GET_SPARSEMATRIX(get_int_sparsematrix, int32_t, DT_SPARSE_INT)
00143 GET_SPARSEMATRIX(get_uint_sparsematrix, uint32_t, DT_SPARSE_UINT)
00144 GET_SPARSEMATRIX(get_long_sparsematrix, int64_t, DT_SPARSE_LONG)
00145 GET_SPARSEMATRIX(get_ulong_sparsematrix, uint64_t, DT_SPARSE_ULONG)
00146 GET_SPARSEMATRIX(get_short_sparsematrix, int16_t, DT_SPARSE_SHORT)
00147 GET_SPARSEMATRIX(get_word_sparsematrix, uint16_t, DT_SPARSE_WORD)
00148 GET_SPARSEMATRIX(get_shortreal_sparsematrix, float32_t, DT_SPARSE_SHORTREAL)
00149 GET_SPARSEMATRIX(get_real_sparsematrix, float64_t, DT_SPARSE_REAL)
00150 GET_SPARSEMATRIX(get_longreal_sparsematrix, floatmax_t, DT_SPARSE_LONGREAL)
00151 #undef GET_SPARSEMATRIX
00152
00153
00154 #define GET_STRING_LIST(fname, sg_type, datatype) \
00155 void CBinaryFile::fname(T_STRING<sg_type>*& strings, int32_t& num_str, int32_t& max_string_len) \
00156 { \
00157 strings=NULL; \
00158 num_str=0; \
00159 max_string_len=0; \
00160 \
00161 if (!file) \
00162 SG_ERROR("File invalid.\n"); \
00163 \
00164 SGDataType dtype=read_header(); \
00165 if (dtype!=datatype) \
00166 SG_ERROR("Datatype mismatch\n"); \
00167 \
00168 if (fread(&num_str, sizeof(int32_t), 1, file)!=1) \
00169 SG_ERROR("Failed to read number of strings\n"); \
00170 \
00171 strings=new T_STRING<sg_type>[num_str]; \
00172 \
00173 for (int32_t i=0; i<num_str; i++) \
00174 { \
00175 int32_t len=0; \
00176 if (fread(&len, sizeof(int32_t), 1, file)!=1) \
00177 SG_ERROR("Failed to read string length of string with idx=%d\n", i); \
00178 strings[i].length=len; \
00179 sg_type* str = new sg_type[len]; \
00180 if (fread(str, sizeof(sg_type), len, file)!= (size_t) len) \
00181 SG_ERROR("Failed to read string %d\n", i); \
00182 strings[i].string=str; \
00183 } \
00184 }
00185
00186 GET_STRING_LIST(get_char_string_list, char, DT_STRING_CHAR)
00187 GET_STRING_LIST(get_byte_string_list, uint8_t, DT_STRING_BYTE)
00188 GET_STRING_LIST(get_int_string_list, int32_t, DT_STRING_INT)
00189 GET_STRING_LIST(get_uint_string_list, uint32_t, DT_STRING_UINT)
00190 GET_STRING_LIST(get_long_string_list, int64_t, DT_STRING_LONG)
00191 GET_STRING_LIST(get_ulong_string_list, uint64_t, DT_STRING_ULONG)
00192 GET_STRING_LIST(get_short_string_list, int16_t, DT_STRING_SHORT)
00193 GET_STRING_LIST(get_word_string_list, uint16_t, DT_STRING_WORD)
00194 GET_STRING_LIST(get_shortreal_string_list, float32_t, DT_STRING_SHORTREAL)
00195 GET_STRING_LIST(get_real_string_list, float64_t, DT_STRING_REAL)
00196 GET_STRING_LIST(get_longreal_string_list, floatmax_t, DT_STRING_LONGREAL)
00197 #undef GET_STRING_LIST
00198
00201 #define SET_VECTOR(fname, sg_type, dtype) \
00202 void CBinaryFile::fname(const sg_type* vec, int32_t len) \
00203 { \
00204 if (!(file && vec)) \
00205 SG_ERROR("File or vector invalid.\n"); \
00206 \
00207 write_header(dtype); \
00208 \
00209 if (fwrite(&len, sizeof(int32_t), 1, file)!=1 || \
00210 fwrite(vec, sizeof(sg_type), len, file)!=(size_t) len) \
00211 SG_ERROR("Failed to write vector\n"); \
00212 }
00213 SET_VECTOR(set_byte_vector, uint8_t, DT_VECTOR_BYTE)
00214 SET_VECTOR(set_char_vector, char, DT_VECTOR_CHAR)
00215 SET_VECTOR(set_int_vector, int32_t, DT_VECTOR_INT)
00216 SET_VECTOR(set_shortreal_vector, float32_t, DT_VECTOR_SHORTREAL)
00217 SET_VECTOR(set_real_vector, float64_t, DT_VECTOR_REAL)
00218 SET_VECTOR(set_short_vector, int16_t, DT_VECTOR_SHORT)
00219 SET_VECTOR(set_word_vector, uint16_t, DT_VECTOR_WORD)
00220 #undef SET_VECTOR
00221
00222 #define SET_MATRIX(fname, sg_type, dtype) \
00223 void CBinaryFile::fname(const sg_type* matrix, int32_t num_feat, int32_t num_vec) \
00224 { \
00225 if (!(file && matrix)) \
00226 SG_ERROR("File or matrix invalid.\n"); \
00227 \
00228 write_header(dtype); \
00229 \
00230 if (fwrite(&num_feat, sizeof(int32_t), 1, file)!=1 || \
00231 fwrite(&num_vec, sizeof(int32_t), 1, file)!=1 || \
00232 fwrite(matrix, sizeof(sg_type)*num_feat, num_vec, file)!=(size_t) num_vec) \
00233 SG_ERROR("Failed to write Matrix\n"); \
00234 }
00235 SET_MATRIX(set_char_matrix, char, DT_DENSE_CHAR)
00236 SET_MATRIX(set_byte_matrix, uint8_t, DT_DENSE_BYTE)
00237 SET_MATRIX(set_int_matrix, int32_t, DT_DENSE_INT)
00238 SET_MATRIX(set_uint_matrix, uint32_t, DT_DENSE_UINT)
00239 SET_MATRIX(set_long_matrix, int64_t, DT_DENSE_LONG)
00240 SET_MATRIX(set_ulong_matrix, uint64_t, DT_DENSE_ULONG)
00241 SET_MATRIX(set_short_matrix, int16_t, DT_DENSE_SHORT)
00242 SET_MATRIX(set_word_matrix, uint16_t, DT_DENSE_WORD)
00243 SET_MATRIX(set_shortreal_matrix, float32_t, DT_DENSE_SHORTREAL)
00244 SET_MATRIX(set_real_matrix, float64_t, DT_DENSE_REAL)
00245 SET_MATRIX(set_longreal_matrix, floatmax_t, DT_DENSE_LONGREAL)
00246 #undef SET_MATRIX
00247
00248 #define SET_SPARSEMATRIX(fname, sg_type, dtype) \
00249 void CBinaryFile::fname(const TSparse<sg_type>* matrix, \
00250 int32_t num_feat, int32_t num_vec) \
00251 { \
00252 if (!(file && matrix)) \
00253 SG_ERROR("File or matrix invalid.\n"); \
00254 \
00255 write_header(dtype); \
00256 \
00257 if (fwrite(&num_vec, sizeof(int32_t), 1, file)!=1) \
00258 SG_ERROR("Failed to write Sparse Matrix\n"); \
00259 \
00260 for (int32_t i=0; i<num_vec; i++) \
00261 { \
00262 TSparseEntry<sg_type>* vec = matrix[i].features; \
00263 int32_t len=matrix[i].num_feat_entries; \
00264 if ((fwrite(&len, sizeof(int32_t), 1, file)!=1) || \
00265 (fwrite(vec, sizeof(TSparseEntry<sg_type>), len, file)!= (size_t) len)) \
00266 SG_ERROR("Failed to write Sparse Matrix\n"); \
00267 } \
00268 }
00269 SET_SPARSEMATRIX(set_bool_sparsematrix, bool, DT_SPARSE_BOOL)
00270 SET_SPARSEMATRIX(set_char_sparsematrix, char, DT_SPARSE_CHAR)
00271 SET_SPARSEMATRIX(set_byte_sparsematrix, uint8_t, DT_SPARSE_BYTE)
00272 SET_SPARSEMATRIX(set_int_sparsematrix, int32_t, DT_SPARSE_INT)
00273 SET_SPARSEMATRIX(set_uint_sparsematrix, uint32_t, DT_SPARSE_UINT)
00274 SET_SPARSEMATRIX(set_long_sparsematrix, int64_t, DT_SPARSE_LONG)
00275 SET_SPARSEMATRIX(set_ulong_sparsematrix, uint64_t, DT_SPARSE_ULONG)
00276 SET_SPARSEMATRIX(set_short_sparsematrix, int16_t, DT_SPARSE_SHORT)
00277 SET_SPARSEMATRIX(set_word_sparsematrix, uint16_t, DT_SPARSE_WORD)
00278 SET_SPARSEMATRIX(set_shortreal_sparsematrix, float32_t, DT_SPARSE_SHORTREAL)
00279 SET_SPARSEMATRIX(set_real_sparsematrix, float64_t, DT_SPARSE_REAL)
00280 SET_SPARSEMATRIX(set_longreal_sparsematrix, floatmax_t, DT_SPARSE_LONGREAL)
00281 #undef SET_SPARSEMATRIX
00282
00283 #define SET_STRING_LIST(fname, sg_type, dtype) \
00284 void CBinaryFile::fname(const T_STRING<sg_type>* strings, int32_t num_str) \
00285 { \
00286 if (!(file && strings)) \
00287 SG_ERROR("File or strings invalid.\n"); \
00288 \
00289 write_header(dtype); \
00290 for (int32_t i=0; i<num_str; i++) \
00291 { \
00292 int32_t len = strings[i].length; \
00293 if ((fwrite(&len, sizeof(int32_t), 1, file)!=1) || \
00294 (fwrite(strings[i].string, sizeof(sg_type), len, file)!= (size_t) len)) \
00295 SG_ERROR("Failed to write Sparse Matrix\n"); \
00296 } \
00297 }
00298 SET_STRING_LIST(set_char_string_list, char, DT_STRING_CHAR)
00299 SET_STRING_LIST(set_byte_string_list, uint8_t, DT_STRING_BYTE)
00300 SET_STRING_LIST(set_int_string_list, int32_t, DT_STRING_INT)
00301 SET_STRING_LIST(set_uint_string_list, uint32_t, DT_STRING_UINT)
00302 SET_STRING_LIST(set_long_string_list, int64_t, DT_STRING_LONG)
00303 SET_STRING_LIST(set_ulong_string_list, uint64_t, DT_STRING_ULONG)
00304 SET_STRING_LIST(set_short_string_list, int16_t, DT_STRING_SHORT)
00305 SET_STRING_LIST(set_word_string_list, uint16_t, DT_STRING_WORD)
00306 SET_STRING_LIST(set_shortreal_string_list, float32_t, DT_STRING_SHORTREAL)
00307 SET_STRING_LIST(set_real_string_list, float64_t, DT_STRING_REAL)
00308 SET_STRING_LIST(set_longreal_string_list, floatmax_t, DT_STRING_LONGREAL)
00309 #undef SET_STRING_LIST
00310
00311
00312 int32_t CBinaryFile::parse_first_header(SGDataType &type)
00313 {
00314 return -1;
00315 }
00316
00317 int32_t CBinaryFile::parse_next_header(SGDataType &type)
00318 {
00319 return -1;
00320 }
00321
00322
00323 SGDataType CBinaryFile::read_header()
00324 {
00325 ASSERT(file);
00326
00327 char fourcc[4];
00328 uint16_t endian=0;
00329 uint16_t dtype=0;
00330
00331 if (!((fread(&fourcc, sizeof(char), 4, file)==4) &&
00332 (fread(&endian, sizeof(uint16_t), 1, file)== 1) &&
00333 (fread(&dtype, sizeof(uint16_t), 1, file)== 1)))
00334 SG_ERROR("Error reading header\n");
00335
00336 if (strncmp(fourcc, "SG00", 4))
00337 SG_ERROR("Header mismatch, expected SG00\n");
00338
00339 return (SGDataType) dtype;
00340 }
00341
00342 void CBinaryFile::write_header(SGDataType datatype)
00343 {
00344 ASSERT(file);
00345
00346 const char* fourcc="SG00";
00347 uint16_t endian=0x1234;
00348 uint16_t dtype=datatype;
00349
00350 if (!((fwrite(fourcc, sizeof(char), 4, file)==4) &&
00351 (fwrite(&endian, sizeof(uint16_t), 1, file)==1) &&
00352 (fwrite(&dtype, sizeof(uint16_t), 1, file)==1)))
00353 SG_ERROR("Error writing header\n");
00354 }