00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #ifndef _CSTRINGFEATURES__H__
00013 #define _CSTRINGFEATURES__H__
00014
00015 #include "lib/common.h"
00016 #include "lib/io.h"
00017 #include "lib/Cache.h"
00018 #include "lib/DynamicArray.h"
00019 #include "lib/File.h"
00020 #include "lib/MemoryMappedFile.h"
00021 #include "lib/Mathematics.h"
00022 #include "lib/Compressor.h"
00023
00024 #include "preproc/PreProc.h"
00025 #include "preproc/StringPreProc.h"
00026 #include "features/Features.h"
00027 #include "features/Alphabet.h"
00028
00029 #include <sys/types.h>
00030 #include <sys/stat.h>
00031 #include <dirent.h>
00032 #include <stdio.h>
00033 #include <stdlib.h>
00034 #include <unistd.h>
00035
00036 namespace shogun
00037 {
00038 class CCompressor;
00039 enum E_COMPRESSION_TYPE;
00040 class CAlphabet;
00041 enum EAlphabet;
00042 template <class T> class CDynamicArray;
00043 class CFile;
00044 template <class T> class CMemoryMappedFile;
00045 class CMath;
00046 template <class ST> class CStringPreProc;
00047 template <class T> class T_STRING;
00048
00049 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00050
00051 template <class T> class T_STRING
00052 {
00053
00054
00055 #ifdef HAVE_BOOST_SERIALIZATION
00056
00057 private:
00058
00059
00060 friend class ::boost::serialization::access;
00061 template<class Archive>
00062 void save(Archive & ar, const unsigned int archive_version) const
00063 {
00064
00065
00066
00067 ar & length;
00068
00069 for (int i=0; i < length; ++i) {
00070 ar & string[i];
00071 }
00072
00073
00074
00075 }
00076
00077 template<class Archive>
00078 void load(Archive & ar, const unsigned int archive_version)
00079 {
00080
00081
00082
00083 ar & length;
00084
00085 string = new T[length];
00086
00087 for (int i=0; i < length; ++i) {
00088 ar & string[i];
00089 }
00090
00091
00092
00093 }
00094
00095 GLOBAL_BOOST_SERIALIZATION_SPLIT_MEMBER();
00096
00097
00098 #endif //HAVE_BOOST_SERIALIZATION
00099
00100 public:
00102 T* string;
00104 int32_t length;
00105 };
00106 #endif // DOXYGEN_SHOULD_SKIP_THIS
00107
00108
00127 template <class ST> class CStringFeatures : public CFeatures
00128 {
00129 public:
00133 CStringFeatures() : CFeatures(0), alphabet(NULL), num_vectors(0),
00134 features(NULL), single_string(NULL),length_of_single_string(0),
00135 max_string_length(0), order(0), symbol_mask_table(NULL),
00136 preprocess_on_get(false), feature_cache(NULL)
00137 {
00138 }
00139
00144 CStringFeatures(EAlphabet alpha)
00145 : CFeatures(0), num_vectors(0), features(NULL),
00146 single_string(NULL),length_of_single_string(0),
00147 max_string_length(0), order(0), symbol_mask_table(NULL),
00148 preprocess_on_get(false), feature_cache(NULL)
00149 {
00150 alphabet=new CAlphabet(alpha);
00151 SG_REF(alphabet);
00152 num_symbols=alphabet->get_num_symbols();
00153 original_num_symbols=num_symbols;
00154 }
00155
00163 CStringFeatures(T_STRING<ST>* p_features, int32_t p_num_vectors,
00164 int32_t p_max_string_length, EAlphabet alpha)
00165 : CFeatures(0), num_vectors(0), features(NULL),
00166 single_string(NULL),length_of_single_string(0),
00167 max_string_length(0), order(0), symbol_mask_table(NULL),
00168 preprocess_on_get(false), feature_cache(NULL)
00169 {
00170 alphabet=new CAlphabet(alpha);
00171 SG_REF(alphabet);
00172 num_symbols=alphabet->get_num_symbols();
00173 original_num_symbols=num_symbols;
00174 set_features(p_features, p_num_vectors, p_max_string_length);
00175 }
00176
00184 CStringFeatures(T_STRING<ST>* p_features, int32_t p_num_vectors,
00185 int32_t p_max_string_length, CAlphabet* alpha)
00186 : CFeatures(0), num_vectors(0), features(NULL),
00187 single_string(NULL),length_of_single_string(0),
00188 max_string_length(0), order(0), symbol_mask_table(NULL),
00189 preprocess_on_get(false), feature_cache(NULL)
00190 {
00191 alphabet=new CAlphabet(alpha);
00192 SG_REF(alphabet);
00193 num_symbols=alphabet->get_num_symbols();
00194 original_num_symbols=num_symbols;
00195 set_features(p_features, p_num_vectors, p_max_string_length);
00196 }
00197
00202 CStringFeatures(CAlphabet* alpha)
00203 : CFeatures(0), num_vectors(0), features(NULL),
00204 single_string(NULL),length_of_single_string(0),
00205 max_string_length(0), order(0), symbol_mask_table(NULL),
00206 preprocess_on_get(false), feature_cache(NULL)
00207 {
00208 ASSERT(alpha);
00209 SG_REF(alpha);
00210 alphabet=alpha;
00211 num_symbols=alphabet->get_num_symbols();
00212 original_num_symbols=num_symbols;
00213 }
00214
00216 CStringFeatures(const CStringFeatures & orig)
00217 : CFeatures(orig), num_vectors(orig.num_vectors),
00218 single_string(orig.single_string),
00219 length_of_single_string(orig.length_of_single_string),
00220 max_string_length(orig.max_string_length),
00221 num_symbols(orig.num_symbols),
00222 original_num_symbols(orig.original_num_symbols),
00223 order(orig.order), preprocess_on_get(false),
00224 feature_cache(NULL)
00225 {
00226 ASSERT(orig.single_string == NULL);
00227
00228 alphabet=orig.alphabet;
00229 SG_REF(alphabet);
00230
00231 if (orig.features)
00232 {
00233 features=new T_STRING<ST>[orig.num_vectors];
00234
00235 for (int32_t i=0; i<num_vectors; i++)
00236 {
00237 features[i].string=new ST[orig.features[i].length];
00238 features[i].length=orig.features[i].length;
00239 memcpy(features[i].string, orig.features[i].string, sizeof(ST)*orig.features[i].length);
00240 }
00241 }
00242
00243 if (orig.symbol_mask_table)
00244 {
00245 symbol_mask_table=new ST[256];
00246 for (int32_t i=0; i<256; i++)
00247 symbol_mask_table[i]=orig.symbol_mask_table[i];
00248 }
00249 }
00250
00256 CStringFeatures(CFile* loader, EAlphabet alpha=DNA)
00257 : CFeatures(loader), num_vectors(0), features(NULL), single_string(NULL),
00258 length_of_single_string(0), max_string_length(0), order(0),
00259 symbol_mask_table(NULL), preprocess_on_get(false), feature_cache(NULL)
00260 {
00261 alphabet=new CAlphabet(alpha);
00262 SG_REF(alphabet);
00263 num_symbols=alphabet->get_num_symbols();
00264 original_num_symbols=num_symbols;
00265 load(loader);
00266 }
00267
00268 virtual ~CStringFeatures()
00269 {
00270 cleanup();
00271
00272 SG_UNREF(alphabet);
00273 }
00274
00276 virtual void cleanup()
00277 {
00278 if (single_string)
00279 {
00280 delete[] single_string;
00281 single_string=NULL;
00282 }
00283 else
00284 {
00285 for (int32_t i=0; i<num_vectors; i++)
00286 cleanup_feature_vector(i);
00287 }
00288
00289 num_vectors=0;
00290 delete[] features;
00291 delete[] symbol_mask_table;
00292 features=NULL;
00293 symbol_mask_table=NULL;
00294
00295
00296
00297
00298
00299 CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet());
00300 SG_UNREF(alphabet);
00301 alphabet=alpha;
00302 SG_REF(alphabet);
00303 }
00304
00306 virtual void cleanup_feature_vector(int32_t num)
00307 {
00308 ASSERT(num<num_vectors);
00309 if (features)
00310 {
00311 delete[] features[num].string;
00312 features[num].string=NULL;
00313 features[num].length=0;
00314 }
00315 }
00316
00321 inline virtual EFeatureClass get_feature_class() { return C_STRING; }
00322
00327 inline virtual EFeatureType get_feature_type() { return F_UNKNOWN; }
00328
00333 inline CAlphabet* get_alphabet()
00334 {
00335 SG_REF(alphabet);
00336 return alphabet;
00337 }
00338
00343 virtual CFeatures* duplicate() const
00344 {
00345 return new CStringFeatures<ST>(*this);
00346 }
00347
00354 void get_feature_vector(ST** dst, int32_t* len, int32_t num)
00355 {
00356 ASSERT(features);
00357 if (num>=num_vectors)
00358 {
00359 SG_ERROR("Index out of bounds (number of strings %d, you "
00360 "requested %d)\n", num_vectors, num);
00361 }
00362
00363 int32_t l;
00364 bool free_vec;
00365 ST* vec=get_feature_vector(num, l, free_vec);
00366 *len=l;
00367 *dst=(ST*) malloc(*len * sizeof(ST));
00368 ASSERT(*dst);
00369 memcpy(*dst, vec, *len * sizeof(ST));
00370 free_feature_vector(vec, num, free_vec);
00371 }
00372
00379 void set_feature_vector(ST* src, int32_t len, int32_t num)
00380 {
00381 ASSERT(features);
00382 if (num>=num_vectors)
00383 {
00384 SG_ERROR("Index out of bounds (number of strings %d, you "
00385 "requested %d)\n", num_vectors, num);
00386 }
00387
00388 if (len<=0)
00389 SG_ERROR("String has zero or negative length\n");
00390
00391
00392 cleanup_feature_vector(num);
00393 features[num].length=len;
00394 features[num].string=new ST[len];
00395 memcpy(features[num].string, src, len*sizeof(ST));
00396
00397 determine_maximum_string_length();
00398 }
00399
00402 void enable_on_the_fly_preprocessing()
00403 {
00404 preprocess_on_get=true;
00405 }
00406
00410 void disable_on_the_fly_preprocessing()
00411 {
00412 preprocess_on_get=false;
00413 }
00414
00423 ST* get_feature_vector(int32_t num, int32_t& len, bool& dofree)
00424 {
00425 ASSERT(features);
00426 ASSERT(num<num_vectors);
00427
00428 if (!preprocess_on_get)
00429 {
00430 dofree=false;
00431 len=features[num].length;
00432 return features[num].string;
00433 }
00434 else
00435 {
00436 SG_DEBUG( "computing feature vector!\n") ;
00437 ST* feat=compute_feature_vector(num, len);
00438 dofree=true;
00439
00440 if (get_num_preproc())
00441 {
00442 ST* tmp_feat_before = feat;
00443
00444 for (int32_t i=0; i<get_num_preproc(); i++)
00445 {
00446 CStringPreProc<ST>* p = (CStringPreProc<ST>*) get_preproc(i);
00447 feat=p->apply_to_string(tmp_feat_before, len);
00448 SG_UNREF(p);
00449 delete[] tmp_feat_before;
00450 tmp_feat_before=feat;
00451 }
00452 }
00453
00454 return feat;
00455 }
00456 }
00457
00462 CStringFeatures<ST>* get_transposed()
00463 {
00464 int32_t num_feat;
00465 int32_t num_vec;
00466 T_STRING<ST>* s=get_transposed(num_feat, num_vec);
00467
00468 return new CStringFeatures<ST>(s, num_vec, num_feat, alphabet);
00469 }
00470
00482 T_STRING<ST>* get_transposed(int32_t &num_feat, int32_t &num_vec)
00483 {
00484 num_feat=num_vectors;
00485 num_vec=get_max_vector_length();
00486 ASSERT(have_same_length());
00487
00488 SG_DEBUG("Allocating memory for transposed string features of size %ld\n",
00489 int64_t(num_feat)*num_vec);
00490
00491 T_STRING<ST>* sf=new T_STRING<ST>[num_vec];
00492
00493 for (int32_t i=0; i<num_vec; i++)
00494 {
00495 sf[i].string=new ST[num_feat];
00496 sf[i].length=num_feat;
00497 }
00498
00499 for (int32_t i=0; i<num_feat; i++)
00500 {
00501 int32_t len=0;
00502 bool free_vec=false;
00503 ST* vec=get_feature_vector(i, len, free_vec);
00504
00505 for (int32_t j=0; j<num_vec; j++)
00506 sf[j].string[i]=vec[j];
00507
00508 free_feature_vector(vec, i, free_vec);
00509 }
00510 return sf;
00511 }
00512
00519 void free_feature_vector(ST* feat_vec, int32_t num, bool dofree)
00520 {
00521 if (feature_cache)
00522 feature_cache->unlock_entry(num);
00523
00524 if (dofree)
00525 delete[] feat_vec ;
00526 }
00527
00534 virtual ST inline get_feature(int32_t vec_num, int32_t feat_num)
00535 {
00536 int32_t len;
00537 bool free_vec;
00538 ST* vec=get_feature_vector(vec_num, len, free_vec);
00539 ASSERT(feat_num<len);
00540 ST result=vec[feat_num];
00541 free_feature_vector(vec, vec_num, free_vec);
00542
00543 return result;
00544 }
00545
00551 virtual inline int32_t get_vector_length(int32_t vec_num)
00552 {
00553 int32_t len;
00554 bool free_vec;
00555 ST* vec=get_feature_vector(vec_num, len, free_vec);
00556 free_feature_vector(vec, vec_num, free_vec);
00557 return len;
00558 }
00559
00564 virtual inline int32_t get_max_vector_length()
00565 {
00566 return max_string_length;
00567 }
00568
00573 virtual inline int32_t get_num_vectors() { return num_vectors; }
00574
00581 inline floatmax_t get_num_symbols() { return num_symbols; }
00582
00590 inline floatmax_t get_max_num_symbols() { return CMath::powl(2,sizeof(ST)*8); }
00591
00592
00593
00598 inline floatmax_t get_original_num_symbols() { return original_num_symbols; }
00599
00604 inline int32_t get_order() { return order; }
00605
00613 inline ST get_masked_symbols(ST symbol, uint8_t mask)
00614 {
00615 ASSERT(symbol_mask_table);
00616 return symbol_mask_table[mask] & symbol;
00617 }
00618
00625 inline ST shift_offset(ST offset, int32_t amount)
00626 {
00627 ASSERT(alphabet);
00628 return (offset << (amount*alphabet->get_num_bits()));
00629 }
00630
00637 inline ST shift_symbol(ST symbol, int32_t amount)
00638 {
00639 ASSERT(alphabet);
00640 return (symbol >> (amount*alphabet->get_num_bits()));
00641 }
00642
00647 virtual inline void load(CFile* loader);
00648
00657 void load_ascii_file(char* fname, bool remap_to_bin=true,
00658 EAlphabet ascii_alphabet=DNA, EAlphabet binary_alphabet=RAWDNA)
00659 {
00660 size_t blocksize=1024*1024;
00661 size_t required_blocksize=0;
00662 uint8_t* dummy=new uint8_t[blocksize];
00663 uint8_t* overflow=NULL;
00664 int32_t overflow_len=0;
00665
00666 cleanup();
00667
00668 CAlphabet* alpha=new CAlphabet(ascii_alphabet);
00669 CAlphabet* alpha_bin=new CAlphabet(binary_alphabet);
00670
00671 FILE* f=fopen(fname, "ro");
00672
00673 if (f)
00674 {
00675 num_vectors=0;
00676 max_string_length=0;
00677
00678 SG_INFO("counting line numbers in file %s\n", fname);
00679 size_t block_offs=0;
00680 size_t old_block_offs=0;
00681 fseek(f, 0, SEEK_END);
00682 size_t fsize=ftell(f);
00683 rewind(f);
00684
00685 if (blocksize>fsize)
00686 blocksize=fsize;
00687
00688 SG_DEBUG("block_size=%ld file_size=%ld\n", blocksize, fsize);
00689
00690 size_t sz=blocksize;
00691 while (sz == blocksize)
00692 {
00693 sz=fread(dummy, sizeof(uint8_t), blocksize, f);
00694 bool contains_cr=false;
00695 for (size_t i=0; i<sz; i++)
00696 {
00697 block_offs++;
00698 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00699 {
00700 num_vectors++;
00701 contains_cr=true;
00702 required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs);
00703 old_block_offs=block_offs;
00704 }
00705 }
00706 SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");
00707 }
00708
00709 SG_INFO("found %d strings\n", num_vectors);
00710 delete[] dummy;
00711 blocksize=required_blocksize;
00712 dummy = new uint8_t[blocksize];
00713 overflow = new uint8_t[blocksize];
00714 features=new T_STRING<ST>[num_vectors];
00715
00716 rewind(f);
00717 sz=blocksize;
00718 int32_t lines=0;
00719 while (sz == blocksize)
00720 {
00721 sz=fread(dummy, sizeof(uint8_t), blocksize, f);
00722
00723 size_t old_sz=0;
00724 for (size_t i=0; i<sz; i++)
00725 {
00726 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00727 {
00728 int32_t len=i-old_sz;
00729
00730 max_string_length=CMath::max(max_string_length, len+overflow_len);
00731
00732 features[lines].length=len;
00733 features[lines].string=new ST[len];
00734
00735 if (remap_to_bin)
00736 {
00737 for (int32_t j=0; j<overflow_len; j++)
00738 features[lines].string[j]=alpha->remap_to_bin(overflow[j]);
00739 for (int32_t j=0; j<len; j++)
00740 features[lines].string[j+overflow_len]=alpha->remap_to_bin(dummy[old_sz+j]);
00741 alpha->add_string_to_histogram(&dummy[old_sz], len);
00742 alpha_bin->add_string_to_histogram(features[lines].string, features[lines].length);
00743 }
00744 else
00745 {
00746 for (int32_t j=0; j<overflow_len; j++)
00747 features[lines].string[j]=overflow[j];
00748 for (int32_t j=0; j<len; j++)
00749 features[lines].string[j+overflow_len]=dummy[old_sz+j];
00750 alpha->add_string_to_histogram(&dummy[old_sz], len);
00751 alpha->add_string_to_histogram(features[lines].string, features[lines].length);
00752 }
00753
00754
00755 overflow_len=0;
00756
00757
00758 old_sz=i+1;
00759 lines++;
00760 SG_PROGRESS(lines, 0, num_vectors, 1, "LOADING:\t");
00761 }
00762 }
00763 for (size_t i=old_sz; i<sz; i++)
00764 overflow[i-old_sz]=dummy[i];
00765
00766 overflow_len=sz-old_sz;
00767 }
00768
00769 if (alpha->check_alphabet_size() && alpha->check_alphabet())
00770 {
00771 SG_INFO("file successfully read\n");
00772 SG_INFO("max_string_length=%d\n", max_string_length);
00773 SG_INFO("num_strings=%d\n", num_vectors);
00774 }
00775 fclose(f);
00776 }
00777
00778 delete[] dummy;
00779
00780 SG_UNREF(alphabet);
00781
00782 if (remap_to_bin)
00783 alphabet = alpha_bin;
00784 else
00785 alphabet = alpha;
00786 SG_REF(alphabet);
00787 num_symbols=alphabet->get_num_symbols();
00788 }
00789
00796 bool load_fasta_file(const char* fname, bool ignore_invalid=false)
00797 {
00798 int32_t i=0;
00799 uint64_t len=0;
00800 uint64_t offs=0;
00801 int32_t num=0;
00802 int32_t max_len=0;
00803
00804 CMemoryMappedFile<char> f(fname);
00805
00806 while (true)
00807 {
00808 char* s=f.get_line(len, offs);
00809 if (!s)
00810 break;
00811
00812 if (len>0 && s[0]=='>')
00813 num++;
00814 }
00815
00816 if (num==0)
00817 SG_ERROR("No fasta hunks (lines starting with '>') found\n");
00818
00819 cleanup();
00820 SG_UNREF(alphabet);
00821 alphabet=new CAlphabet(DNA);
00822 num_symbols=alphabet->get_num_symbols();
00823
00824 T_STRING<ST>* strings=new T_STRING<ST>[num];
00825 offs=0;
00826
00827 for (i=0;i<num; i++)
00828 {
00829 uint64_t id_len=0;
00830 char* id=f.get_line(id_len, offs);
00831
00832 char* fasta=f.get_line(len, offs);
00833 char* s=fasta;
00834 int32_t fasta_len=0;
00835 int32_t spanned_lines=0;
00836
00837 while (true)
00838 {
00839 if (!s || len==0)
00840 SG_ERROR("Error reading fasta entry in line %d len=%ld", 4*i+1, len);
00841
00842 if (s[0]=='>' || offs==f.get_size())
00843 {
00844 offs-=len+1;
00845 if (offs==f.get_size())
00846 {
00847 SG_DEBUG("at EOF\n");
00848 fasta_len+=len;
00849 }
00850
00851 len = fasta_len-spanned_lines;
00852 strings[i].string=new ST[len];
00853 strings[i].length=len;
00854
00855 ST* str=strings[i].string;
00856 int32_t idx=0;
00857 SG_DEBUG("'%.*s', len=%d, spanned_lines=%d\n", (int32_t) id_len, id, (int32_t) len, (int32_t) spanned_lines);
00858
00859 for (int32_t j=0; j<fasta_len; j++)
00860 {
00861 if (fasta[j]=='\n')
00862 continue;
00863
00864 ST c = (ST) fasta[j];
00865
00866 if (ignore_invalid && !alphabet->is_valid((uint8_t) fasta[j]))
00867 c = (ST) 'A';
00868
00869 if (idx>=len)
00870 SG_ERROR("idx=%d j=%d fasta_len=%d, spanned_lines=%d str='%.*s'\n", idx, j, fasta_len, spanned_lines, idx, str);
00871 str[idx++]=c;
00872 }
00873 max_len=CMath::max(max_len, strings[i].length);
00874
00875
00876 break;
00877 }
00878
00879 spanned_lines++;
00880 fasta_len+=len+1;
00881 s=f.get_line(len, offs);
00882 }
00883 }
00884
00885 return set_features(strings, num, max_len);
00886 }
00887
00895 bool load_fastq_file(const char* fname,
00896 bool ignore_invalid=false, bool bitremap_in_single_string=false)
00897 {
00898 CMemoryMappedFile<char> f(fname);
00899
00900 int32_t i=0;
00901 uint64_t len=0;
00902 uint64_t offs=0;
00903
00904 int32_t num=f.get_num_lines();
00905 int32_t max_len=0;
00906
00907 if (num%4)
00908 SG_ERROR("Number of lines must be divisible by 4 in fastq files\n");
00909 num/=4;
00910
00911 cleanup();
00912 SG_UNREF(alphabet);
00913 alphabet=new CAlphabet(DNA);
00914
00915 T_STRING<ST>* strings;
00916
00917 ST* str;
00918 if (bitremap_in_single_string)
00919 {
00920 strings=new T_STRING<ST>[1];
00921 strings[0].string=new ST[num];
00922 strings[0].length=num;
00923 f.get_line(len, offs);
00924 f.get_line(len, offs);
00925 order=len;
00926 max_len=num;
00927 offs=0;
00928 original_num_symbols=alphabet->get_num_symbols();
00929 int32_t max_val=alphabet->get_num_bits();
00930 str=new ST[len];
00931 }
00932 else
00933 strings=new T_STRING<ST>[num];
00934
00935 for (i=0;i<num; i++)
00936 {
00937 if (!f.get_line(len, offs))
00938 SG_ERROR("Error reading 'read' identifier in line %d", 4*i);
00939
00940 char* s=f.get_line(len, offs);
00941 if (!s || len==0)
00942 SG_ERROR("Error reading 'read' in line %d len=%ld", 4*i+1, len);
00943
00944 if (bitremap_in_single_string)
00945 {
00946 if (len!=order)
00947 SG_ERROR("read in line %d not of length %d (is %d)\n", 4*i+1, order, len);
00948 for (int32_t j=0; j<order; j++)
00949 str[j]=(ST) alphabet->remap_to_bin((uint8_t) s[j]);
00950
00951 strings[0].string[i]=embed_word(str, order);
00952 }
00953 else
00954 {
00955 strings[i].string=new ST[len];
00956 strings[i].length=len;
00957 str=strings[i].string;
00958
00959 if (ignore_invalid)
00960 {
00961 for (int32_t j=0; j<len; j++)
00962 {
00963 if (alphabet->is_valid((uint8_t) s[j]))
00964 str[j]= (ST) s[j];
00965 else
00966 str[j]= (ST) 'A';
00967 }
00968 }
00969 else
00970 {
00971 for (int32_t j=0; j<len; j++)
00972 str[j]= (ST) s[j];
00973 }
00974 max_len=CMath::max(max_len, (int32_t) len);
00975 }
00976
00977
00978 if (!f.get_line(len, offs))
00979 SG_ERROR("Error reading 'read' quality identifier in line %d", 4*i+2);
00980
00981 if (!f.get_line(len, offs))
00982 SG_ERROR("Error reading 'read' quality in line %d", 4*i+3);
00983 }
00984
00985 if (bitremap_in_single_string)
00986 num=1;
00987
00988 num_vectors=num;
00989 max_string_length=max_len;
00990 features=strings;
00991
00992 return true;
00993 }
00994
01000 bool load_from_directory(char* dirname)
01001 {
01002 struct dirent **namelist;
01003 int32_t n;
01004
01005 CIO::set_dirname(dirname);
01006
01007 SG_DEBUG("dirname '%s'\n", dirname);
01008
01009 n = scandir(dirname, &namelist, &CIO::filter, alphasort);
01010 if (n <= 0)
01011 {
01012 SG_ERROR("error calling scandir - no files found\n");
01013 return false;
01014 }
01015 else
01016 {
01017 T_STRING<ST>* strings=NULL;
01018
01019 int32_t num=0;
01020 int32_t max_len=-1;
01021
01022
01023
01024 strings=new T_STRING<ST>[n];
01025
01026 for (int32_t i=0; i<n; i++)
01027 {
01028 char* fname=CIO::concat_filename(namelist[i]->d_name);
01029
01030 struct stat s;
01031 off_t filesize=0;
01032
01033 if (!stat(fname, &s) && s.st_size>0)
01034 {
01035 filesize=s.st_size/sizeof(ST);
01036
01037 FILE* f=fopen(fname, "ro");
01038 if (f)
01039 {
01040 ST* str=new ST[filesize];
01041 SG_DEBUG("%s:%ld\n", fname, (int64_t) filesize);
01042 fread(str, sizeof(ST), filesize, f);
01043 strings[num].string=str;
01044 strings[num].length=filesize;
01045 max_len=CMath::max(max_len, strings[num].length);
01046
01047 num++;
01048 fclose(f);
01049 }
01050 }
01051 else
01052 SG_ERROR("empty or non readable file \'%s\'\n", fname);
01053
01054 free(namelist[i]);
01055 }
01056 free(namelist);
01057
01058 if (num>0 && strings)
01059 {
01060 set_features(strings, num, max_len);
01061 return true;
01062 }
01063 }
01064 return false;
01065 }
01066
01074 bool set_features(T_STRING<ST>* p_features, int32_t p_num_vectors, int32_t p_max_string_length)
01075 {
01076 if (p_features)
01077 {
01078 CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet());
01079
01080
01081 for (int32_t i=0; i<p_num_vectors; i++)
01082 alpha->add_string_to_histogram( p_features[i].string, p_features[i].length);
01083
01084 SG_INFO("max_value_in_histogram:%d\n", alpha->get_max_value_in_histogram());
01085 SG_INFO("num_symbols_in_histogram:%d\n", alpha->get_num_symbols_in_histogram());
01086
01087 if (alpha->check_alphabet_size() && alpha->check_alphabet())
01088 {
01089 cleanup();
01090 SG_UNREF(alphabet);
01091
01092 alphabet=alpha;
01093 SG_REF(alphabet);
01094
01095 this->features=p_features;
01096 this->num_vectors=p_num_vectors;
01097 this->max_string_length=p_max_string_length;
01098
01099 return true;
01100 }
01101 else
01102 SG_UNREF(alpha);
01103 }
01104
01105 return false;
01106 }
01107
01113 bool append_features(CStringFeatures<ST>* sf)
01114 {
01115 ASSERT(sf);
01116 T_STRING<ST>* new_features = new T_STRING<ST>[sf->num_vectors];
01117
01118 for (int32_t i=0; i<sf->num_vectors; i++)
01119 {
01120 int32_t length=sf->features[i].length;
01121 new_features[i].string=new ST[length];
01122 memcpy(new_features[i].string, sf->features[i].string, length);
01123 new_features[i].length=length;
01124 }
01125 return append_features(new_features, sf->num_vectors,
01126 sf->max_string_length);
01127 }
01128
01139 bool append_features(T_STRING<ST>* p_features, int32_t p_num_vectors, int32_t p_max_string_length)
01140 {
01141 if (!features)
01142 return set_features(p_features, p_num_vectors, p_max_string_length);
01143
01144 CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet());
01145
01146
01147 for (int32_t i=0; i<p_num_vectors; i++)
01148 alpha->add_string_to_histogram( p_features[i].string, p_features[i].length);
01149
01150 SG_INFO("max_value_in_histogram:%d\n", alpha->get_max_value_in_histogram());
01151 SG_INFO("num_symbols_in_histogram:%d\n", alpha->get_num_symbols_in_histogram());
01152
01153 if (alpha->check_alphabet_size() && alpha->check_alphabet())
01154 {
01155 SG_UNREF(alpha);
01156 for (int32_t i=0; i<p_num_vectors; i++)
01157 alphabet->add_string_to_histogram( p_features[i].string, p_features[i].length);
01158
01159 int32_t old_num_vectors=num_vectors;
01160 num_vectors=old_num_vectors+p_num_vectors;
01161 T_STRING<ST>* new_features = new T_STRING<ST>[num_vectors];
01162
01163 for (int32_t i=0; i<num_vectors; i++)
01164 {
01165 if (i<old_num_vectors)
01166 {
01167 new_features[i].string=features[i].string;
01168 new_features[i].length=features[i].length;
01169 }
01170 else
01171 {
01172 new_features[i].string=p_features[i-old_num_vectors].string;
01173 new_features[i].length=p_features[i-old_num_vectors].length;
01174 }
01175 }
01176 delete[] features;
01177 delete[] p_features;
01178
01179 this->features=new_features;
01180 this->max_string_length=CMath::max(max_string_length, p_max_string_length);
01181
01182 return true;
01183 }
01184 SG_UNREF(alpha);
01185
01186 return false;
01187 }
01188
01195 virtual T_STRING<ST>* get_features(int32_t& num_str, int32_t& max_str_len)
01196 {
01197 num_str=num_vectors;
01198 max_str_len=max_string_length;
01199 return features;
01200 }
01201
01208 virtual T_STRING<ST>* copy_features(int32_t& num_str, int32_t& max_str_len)
01209 {
01210 ASSERT(num_vectors>0);
01211
01212 num_str=num_vectors;
01213 max_str_len=max_string_length;
01214 T_STRING<ST>* new_feat=new T_STRING<ST>[num_str];
01215
01216 for (int32_t i=0; i<num_str; i++)
01217 {
01218 int32_t len;
01219 bool free_vec;
01220 ST* vec=get_feature_vector(i, len, free_vec);
01221 new_feat[i].string=new ST[len];
01222 new_feat[i].length=len;
01223 memcpy(new_feat[i].string, vec, ((size_t) len) * sizeof(ST));
01224 free_feature_vector(vec, i, free_vec);
01225 }
01226
01227 return new_feat;
01228 }
01229
01235 virtual void get_features(T_STRING<ST>** dst, int32_t* num_str)
01236 {
01237 int32_t num_vec;
01238 int32_t max_str_len;
01239 *dst=copy_features(num_vec, max_str_len);
01240 *num_str=num_vec;
01241 }
01242
01247 virtual inline void save(CFile* writer);
01248
01255 virtual bool load_compressed(char* src, bool decompress)
01256 {
01257 FILE* file=NULL;
01258
01259 if (!(file=fopen(src, "r")))
01260 return false;
01261 cleanup();
01262
01263
01264 char id[4];
01265 fread(&id[0], sizeof(char), 1, file);
01266 ASSERT(id[0]=='S');
01267 fread(&id[1], sizeof(char), 1, file);
01268 ASSERT(id[1]=='G');
01269 fread(&id[2], sizeof(char), 1, file);
01270 ASSERT(id[2]=='V');
01271 fread(&id[3], sizeof(char), 1, file);
01272 ASSERT(id[3]=='0');
01273
01274
01275 uint8_t c;
01276 fread(&c, sizeof(uint8_t), 1, file);
01277 CCompressor* compressor= new CCompressor((E_COMPRESSION_TYPE) c);
01278
01279 uint8_t a;
01280 delete alphabet;
01281 fread(&a, sizeof(uint8_t), 1, file);
01282 alphabet=new CAlphabet((EAlphabet) a);
01283
01284 fread(&num_vectors, sizeof(int32_t), 1, file);
01285 ASSERT(num_vectors>0);
01286
01287 fread(&max_string_length, sizeof(int32_t), 1, file);
01288 ASSERT(max_string_length>0);
01289
01290 features=new T_STRING<ST>[num_vectors];
01291
01292
01293 for (int32_t i=0; i<num_vectors; i++)
01294 {
01295
01296 int32_t len_compressed;
01297 fread(&len_compressed, sizeof(int32_t), 1, file);
01298
01299 int32_t len_uncompressed;
01300 fread(&len_uncompressed, sizeof(int32_t), 1, file);
01301
01302
01303 if (decompress)
01304 {
01305 features[i].string=new ST[len_uncompressed];
01306 features[i].length=len_uncompressed;
01307 uint8_t* compressed=new uint8_t[len_compressed];
01308 fread(compressed, len_compressed, 1, file);
01309 uint64_t uncompressed_size=len_uncompressed;
01310 uncompressed_size*=sizeof(ST);
01311 compressor->decompress(compressed, len_compressed,
01312 (uint8_t*) features[i].string, uncompressed_size);
01313 delete[] compressed;
01314 ASSERT(uncompressed_size==((uint64_t) len_uncompressed)*sizeof(ST));
01315 }
01316 else
01317 {
01318 int32_t offs=CMath::ceil(2.0*sizeof(int32_t)/sizeof(ST));
01319 features[i].string=new ST[len_compressed+offs];
01320 features[i].length=len_compressed+offs;
01321 int32_t* feat32ptr=((int32_t*) (features[i].string));
01322 memset(features[i].string, 0, offs*sizeof(ST));
01323 feat32ptr[0]=(int32_t) len_compressed;
01324 feat32ptr[1]=(int32_t) len_uncompressed;
01325 uint8_t* compressed=(uint8_t*) (&features[i].string[offs]);
01326 fread(compressed, len_compressed, 1, file);
01327 }
01328 }
01329
01330 delete compressor;
01331 fclose(file);
01332 return false;
01333 }
01334
01342 virtual bool save_compressed(char* dest, E_COMPRESSION_TYPE compression, int level)
01343 {
01344 FILE* file=NULL;
01345
01346 if (!(file=fopen(dest, "wb")))
01347 return false;
01348
01349 CCompressor* compressor= new CCompressor(compression);
01350
01351
01352 const char* id="SGV0";
01353 fwrite(&id[0], sizeof(char), 1, file);
01354 fwrite(&id[1], sizeof(char), 1, file);
01355 fwrite(&id[2], sizeof(char), 1, file);
01356 fwrite(&id[3], sizeof(char), 1, file);
01357
01358
01359 uint8_t c=(uint8_t) compression;
01360 fwrite(&c, sizeof(uint8_t), 1, file);
01361
01362 uint8_t a=(uint8_t) alphabet->get_alphabet();
01363 fwrite(&a, sizeof(uint8_t), 1, file);
01364
01365 fwrite(&num_vectors, sizeof(int32_t), 1, file);
01366
01367 fwrite(&max_string_length, sizeof(int32_t), 1, file);
01368
01369
01370 for (int32_t i=0; i<num_vectors; i++)
01371 {
01372 int32_t len=-1;
01373 bool vfree;
01374 ST* vec=get_feature_vector(i, len, vfree);
01375
01376 uint8_t* compressed=NULL;
01377 uint64_t compressed_size=0;
01378
01379 compressor->compress((uint8_t*) vec, ((uint64_t) len)*sizeof(ST),
01380 compressed, compressed_size, level);
01381
01382 int32_t len_compressed = (int32_t) compressed_size;
01383
01384 fwrite(&len_compressed, sizeof(int32_t), 1, file);
01385
01386 fwrite(&len, sizeof(int32_t), 1, file);
01387
01388 fwrite(compressed, compressed_size, 1, file);
01389 delete[] compressed;
01390
01391 free_feature_vector(vec, i, vfree);
01392 }
01393
01394 delete compressor;
01395 fclose(file);
01396 return true;
01397 }
01398
01399
01404 virtual int32_t get_size() { return sizeof(ST); }
01405
01411 virtual bool apply_preproc(bool force_preprocessing=false)
01412 {
01413 SG_DEBUG( "force: %d\n", force_preprocessing);
01414
01415 for (int32_t i=0; i<get_num_preproc(); i++)
01416 {
01417 if ( (!is_preprocessed(i) || force_preprocessing) )
01418 {
01419 set_preprocessed(i);
01420 CStringPreProc<ST>* p = (CStringPreProc<ST>*) get_preproc(i);
01421 SG_INFO( "preprocessing using preproc %s\n", p->get_name());
01422
01423 if (!p->apply_to_string_features(this))
01424 {
01425 SG_UNREF(p);
01426 return false;
01427 }
01428 else
01429 SG_UNREF(p);
01430 }
01431 }
01432 return true;
01433 }
01434
01444 int32_t obtain_by_sliding_window(int32_t window_size, int32_t step_size, int32_t skip=0)
01445 {
01446 ASSERT(step_size>0);
01447 ASSERT(window_size>0);
01448 ASSERT(num_vectors==1 || single_string);
01449 ASSERT(max_string_length>=window_size ||
01450 (single_string && length_of_single_string>=window_size));
01451
01452
01453
01454 if (single_string)
01455 num_vectors= (length_of_single_string-window_size)/step_size + 1;
01456 else if (num_vectors==1)
01457 {
01458 num_vectors= (max_string_length-window_size)/step_size + 1;
01459 length_of_single_string=max_string_length;
01460 }
01461
01462 T_STRING<ST>* f=new T_STRING<ST>[num_vectors];
01463 int32_t offs=0;
01464 for (int32_t i=0; i<num_vectors; i++)
01465 {
01466 f[i].string=&features[0].string[offs+skip];
01467 f[i].length=window_size-skip;
01468 offs+=step_size;
01469 }
01470 single_string=features[0].string;
01471 delete[] features;
01472 features=f;
01473 max_string_length=window_size-skip;
01474
01475 return num_vectors;
01476 }
01477
01486 int32_t obtain_by_position_list(int32_t window_size, CDynamicArray<int32_t>* positions, int32_t skip=0)
01487 {
01488 ASSERT(positions);
01489 ASSERT(window_size>0);
01490 ASSERT(num_vectors==1 || single_string);
01491 ASSERT(max_string_length>=window_size ||
01492 (single_string && length_of_single_string>=window_size));
01493
01494 num_vectors= positions->get_num_elements();
01495 ASSERT(num_vectors>0);
01496
01497 int32_t len;
01498
01499
01500
01501 if (single_string)
01502 len=length_of_single_string;
01503 else
01504 {
01505 single_string=features[0].string;
01506 len=max_string_length;
01507 length_of_single_string=max_string_length;
01508 }
01509
01510 T_STRING<ST>* f=new T_STRING<ST>[num_vectors];
01511 for (int32_t i=0; i<num_vectors; i++)
01512 {
01513 int32_t p=positions->get_element(i);
01514
01515 if (p>=0 && p<=len-window_size)
01516 {
01517 f[i].string=&features[0].string[p+skip];
01518 f[i].length=window_size-skip;
01519 }
01520 else
01521 {
01522 num_vectors=1;
01523 max_string_length=len;
01524 features[0].length=len;
01525 single_string=NULL;
01526 delete[] f;
01527 SG_ERROR("window (size:%d) starting at position[%d]=%d does not fit in sequence(len:%d)\n",
01528 window_size, i, p, len);
01529 return -1;
01530 }
01531 }
01532
01533 delete[] features;
01534 features=f;
01535 max_string_length=window_size-skip;
01536
01537 return num_vectors;
01538 }
01539
01551 inline bool obtain_from_char(CStringFeatures<char>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
01552 {
01553 return obtain_from_char_features(sf, start, p_order, gap, rev);
01554 }
01555
01565 template <class CT>
01566 bool obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
01567 {
01568 ASSERT(sf);
01569
01570 CAlphabet* alpha=sf->get_alphabet();
01571 ASSERT(alpha->get_num_symbols_in_histogram() > 0);
01572
01573 this->order=p_order;
01574 cleanup();
01575
01576 num_vectors=sf->get_num_vectors();
01577 ASSERT(num_vectors>0);
01578 max_string_length=sf->get_max_vector_length()-start;
01579 features=new T_STRING<ST>[num_vectors];
01580
01581 SG_DEBUG( "%1.0llf symbols in StringFeatures<*> %d symbols in histogram\n", sf->get_num_symbols(),
01582 alpha->get_num_symbols_in_histogram());
01583
01584 for (int32_t i=0; i<num_vectors; i++)
01585 {
01586 int32_t len=-1;
01587 bool vfree;
01588 CT* c=sf->get_feature_vector(i, len, vfree);
01589 ASSERT(!vfree);
01590
01591 features[i].string=new ST[len];
01592 features[i].length=len;
01593
01594 ST* str=features[i].string;
01595 for (int32_t j=0; j<len; j++)
01596 str[j]=(ST) alpha->remap_to_bin(c[j]);
01597 }
01598
01599 original_num_symbols=alpha->get_num_symbols();
01600 int32_t max_val=alpha->get_num_bits();
01601
01602 SG_UNREF(alpha);
01603
01604 if (p_order>1)
01605 num_symbols=CMath::powl((floatmax_t) 2, (floatmax_t) max_val*p_order);
01606 else
01607 num_symbols=original_num_symbols;
01608 SG_INFO( "max_val (bit): %d order: %d -> results in num_symbols: %.0Lf\n", max_val, p_order, num_symbols);
01609
01610 if ( ((floatmax_t) num_symbols) > CMath::powl(((floatmax_t) 2),((floatmax_t) sizeof(ST)*8)) )
01611 {
01612 SG_ERROR( "symbol does not fit into datatype \"%c\" (%d)\n", (char) max_val, (int) max_val);
01613 return false;
01614 }
01615
01616 SG_DEBUG( "translate: start=%i order=%i gap=%i(size:%i)\n", start, p_order, gap, sizeof(ST)) ;
01617 for (int32_t line=0; line<num_vectors; line++)
01618 {
01619 int32_t len=0;
01620 bool vfree;
01621 ST* fv=get_feature_vector(line, len, vfree);
01622 ASSERT(!vfree);
01623
01624 if (rev)
01625 CAlphabet::translate_from_single_order_reversed(fv, len, start+gap, p_order+gap, max_val, gap);
01626 else
01627 CAlphabet::translate_from_single_order(fv, len, start+gap, p_order+gap, max_val, gap);
01628
01629
01630 features[line].length-=start+gap ;
01631 if (features[line].length<0)
01632 features[line].length=0 ;
01633 }
01634
01635 compute_symbol_mask_table(max_val);
01636
01637 return true;
01638 }
01639
01647 bool have_same_length(int32_t len=-1)
01648 {
01649 if (len!=-1)
01650 {
01651 if (len!=get_max_vector_length())
01652 return false;
01653 }
01654 len = get_max_vector_length();
01655
01656 for (int32_t i=0; i<num_vectors; i++)
01657 {
01658 if (get_vector_length(i)!=len)
01659 return false;
01660 }
01661
01662 return true;
01663 }
01664
01669 inline void embed_features(int32_t p_order)
01670 {
01671 ASSERT(alphabet->get_num_symbols_in_histogram() > 0);
01672
01673 order=p_order;
01674 original_num_symbols=alphabet->get_num_symbols();
01675 int32_t max_val=alphabet->get_num_bits();
01676
01677 if (p_order>1)
01678 num_symbols=CMath::powl((floatmax_t) 2, (floatmax_t) max_val*p_order);
01679 else
01680 num_symbols=original_num_symbols;
01681
01682 SG_INFO( "max_val (bit): %d order: %d -> results in num_symbols: %.0Lf\n", max_val, p_order, num_symbols);
01683
01684 if ( ((floatmax_t) num_symbols) > CMath::powl(((floatmax_t) 2),((floatmax_t) sizeof(ST)*8)) )
01685 SG_WARNING("symbols did not fit into datatype \"%c\" (%d)\n", (char) max_val, (int) max_val);
01686
01687 ST mask=0;
01688 for (int32_t i=0; i<p_order*max_val; i++)
01689 mask= (mask<<1) | ((ST) 1);
01690
01691 for (int32_t i=0; i<num_vectors; i++)
01692 {
01693 int32_t len=features[i].length;
01694
01695 if (len < p_order)
01696 SG_ERROR("Sequence must be longer than order (%d vs. %d)\n", len, p_order);
01697
01698 ST* str = features[i].string;
01699
01700
01701 for (int32_t j=0; j<p_order; j++)
01702 str[j]=(ST) alphabet->remap_to_bin(str[j]);
01703 str[0]=embed_word(&str[0], p_order);
01704
01705
01706 int32_t idx=0;
01707 for (int32_t j=p_order; j<len; j++)
01708 {
01709 str[j]=(ST) alphabet->remap_to_bin(str[j]);
01710 str[idx+1]= ((str[idx]<<max_val) | str[j]) & mask;
01711 idx++;
01712 }
01713
01714 features[i].length=len-p_order+1;
01715 }
01716
01717 compute_symbol_mask_table(max_val);
01718 }
01719
01724 inline void compute_symbol_mask_table(int64_t max_val)
01725 {
01726 delete[] symbol_mask_table;
01727 symbol_mask_table=new ST[256];
01728
01729 uint64_t mask=0;
01730 for (int32_t i=0; i< (int64_t) max_val; i++)
01731 mask=(mask<<1) | 1;
01732
01733 for (int32_t i=0; i<256; i++)
01734 {
01735 uint8_t bits=(uint8_t) i;
01736 symbol_mask_table[i]=0;
01737
01738 for (int32_t j=0; j<8; j++)
01739 {
01740 if (bits & 1)
01741 symbol_mask_table[i]|=mask<<(max_val*j);
01742
01743 bits>>=1;
01744 }
01745 }
01746 }
01747
01754 inline void unembed_word(ST word, uint8_t* seq, int32_t len)
01755 {
01756 uint32_t nbits= (uint32_t) alphabet->get_num_bits();
01757
01758 ST mask=0;
01759 for (int32_t i=0; i<nbits; i++)
01760 mask=(mask<<1) | (ST) 1;
01761
01762 for (int32_t i=0; i<len; i++)
01763 {
01764 ST w=(word & mask);
01765 seq[len-i-1]=alphabet->remap_to_char((uint8_t) w);
01766 word>>=nbits;
01767 }
01768 }
01769
01775 inline ST embed_word(ST* seq, int32_t len)
01776 {
01777 ST value=(ST) 0;
01778 uint32_t nbits= (uint32_t) alphabet->get_num_bits();
01779 for (int32_t i=0; i<len; i++)
01780 {
01781 value<<=nbits;
01782 value|=seq[i];
01783 }
01784
01785 return value;
01786 }
01787
01790 void determine_maximum_string_length()
01791 {
01792 max_string_length=0;
01793
01794 for (int32_t i=0; i<num_vectors; i++)
01795 max_string_length=CMath::max(max_string_length, features[i].length);
01796 }
01797
01805 static ST* get_zero_terminated_string_copy(T_STRING<ST> str)
01806 {
01807 int32_t l=str.length;
01808 ST* s=new ST[l+1];
01809 memcpy(s, str.string, sizeof(ST)*l);
01810 s[l]='\0';
01811 return s;
01812 }
01813
01820 virtual void set_feature_vector(int32_t num, ST* string, int32_t len)
01821 {
01822 ASSERT(features);
01823 ASSERT(num<num_vectors);
01824
01825 features[num].length=len ;
01826 features[num].string=string ;
01827
01828 max_string_length=CMath::max(len, max_string_length);
01829 }
01830
01831
01834 virtual void get_histogram(float64_t** hist, int32_t* rows, int32_t* cols, bool normalize=true)
01835 {
01836 int32_t nsym=get_num_symbols();
01837 int32_t slen=get_max_vector_length();
01838 int64_t sz=int64_t(nsym)*slen*sizeof(float64_t);
01839 float64_t* h= (float64_t*) malloc(sz);
01840 ASSERT(h);
01841 memset(h, 0, sz);
01842
01843 float64_t* h_normalizer=new float64_t[slen];
01844 memset(h_normalizer, 0, slen*sizeof(float64_t));
01845 int32_t num_str=get_num_vectors();
01846 for (int32_t i=0; i<num_str; i++)
01847 {
01848 int32_t len;
01849 bool free_vec;
01850 ST* vec=get_feature_vector(i, len, free_vec);
01851 for (int32_t j=0; j<len; j++)
01852 {
01853 h[int64_t(j)*nsym+alphabet->remap_to_bin(vec[j])]++;
01854 h_normalizer[j]++;
01855 }
01856 free_feature_vector(vec, i, free_vec);
01857 }
01858
01859 if (normalize)
01860 {
01861 for (int32_t i=0; i<slen; i++)
01862 {
01863 for (int32_t j=0; j<nsym; j++)
01864 {
01865 if (h_normalizer && h_normalizer[i])
01866 h[int64_t(i)*nsym+j]/=h_normalizer[i];
01867 }
01868 }
01869 }
01870 delete[] h_normalizer;
01871
01872 *hist=h;
01873 *rows=nsym;
01874 *cols=slen;
01875 }
01876
01879 virtual void create_random(float64_t* hist, int32_t rows, int32_t cols, int32_t num_vec)
01880 {
01881 ASSERT(rows == get_num_symbols());
01882 cleanup();
01883 float64_t* randoms=new float64_t[cols];
01884 T_STRING<ST>* sf=new T_STRING<ST>[num_vec];
01885
01886 for (int32_t i=0; i<num_vec; i++)
01887 {
01888 sf[i].string=new ST[cols];
01889 sf[i].length=cols;
01890
01891 CMath::random_vector(randoms, cols, 0.0, 1.0);
01892
01893 for (int32_t j=0; j<cols; j++)
01894 {
01895 float64_t lik=hist[int64_t(j)*rows+0];
01896
01897 int32_t c;
01898 for (c=0; c<rows-1; c++)
01899 {
01900 if (randoms[j]<=lik)
01901 break;
01902 lik+=hist[int64_t(j)*rows+c+1];
01903 }
01904 sf[i].string[j]=alphabet->remap_to_char(c);
01905 }
01906 }
01907 delete[] randoms;
01908 set_features(sf, num_vec, cols);
01909 }
01910
01912 inline virtual const char* get_name() const { return "StringFeatures"; }
01913
01914 protected:
01915
01926 virtual ST* compute_feature_vector(int32_t num, int32_t& len)
01927 {
01928 ASSERT(features && num<num_vectors);
01929
01930 len=features[num].length;
01931 if (len<=0)
01932 return NULL;
01933
01934 ST* target=new ST[len];
01935 memcpy(target, features[num].string, len*sizeof(ST));
01936 return target;
01937 }
01938
01939 #ifdef HAVE_BOOST_SERIALIZATION
01940 private:
01941
01942 friend class ::boost::serialization::access;
01943 template<class Archive>
01944 void save(Archive & ar, const unsigned int archive_version) const
01945 {
01946
01947 SG_DEBUG("archiving StringFeatures\n");
01948
01949 ar & ::boost::serialization::base_object<CFeatures>(*this);
01950
01951 ar & alphabet;
01952
01953 ar & num_vectors;
01954 for (int i=0; i < num_vectors; ++i) {
01955 ar & features[i];
01956 }
01957
01958 ar & length_of_single_string;
01959 for (int i=0; i < length_of_single_string; ++i) {
01960 ar & single_string[i];
01961 }
01962
01963 ar & max_string_length;
01964 ar & num_symbols;
01965 ar & original_num_symbols;
01966 ar & order;
01967
01969
01970
01971
01972 SG_DEBUG("done archiving StringFeatures\n");
01973
01974 }
01975
01976 template<class Archive>
01977 void load(Archive & ar, const unsigned int archive_version)
01978 {
01979
01980 SG_DEBUG("archiving StringFeatures\n");
01981
01982 ar & ::boost::serialization::base_object<CFeatures>(*this);
01983
01984
01985 ar & alphabet;
01986
01987 ar & num_vectors;
01988
01989
01990 features = new T_STRING<ST>[num_vectors];
01991 for (int i=0; i < num_vectors; ++i) {
01992 ar & features[i];
01993 }
01994
01995
01996 ar & length_of_single_string;
01997
01998
01999 single_string = new ST[length_of_single_string];
02000 for (int i=0; i < length_of_single_string; ++i) {
02001 ar & single_string[i];
02002 }
02003
02004 ar & max_string_length;
02005 ar & num_symbols;
02006 ar & original_num_symbols;
02007 ar & order;
02008
02010
02011
02012
02013 SG_DEBUG("done archiving StringFeatures\n");
02014
02015 }
02016
02017 GLOBAL_BOOST_SERIALIZATION_SPLIT_MEMBER();
02018
02019
02020 #endif //HAVE_BOOST_SERIALIZATION
02021
02022
02023 protected:
02024
02026 CAlphabet* alphabet;
02027
02029 int32_t num_vectors;
02030
02032 T_STRING<ST>* features;
02033
02035 ST* single_string;
02036
02038 int32_t length_of_single_string;
02039
02041 int32_t max_string_length;
02042
02044 floatmax_t num_symbols;
02045
02047 floatmax_t original_num_symbols;
02048
02050 int32_t order;
02051
02053 ST* symbol_mask_table;
02054
02056 bool preprocess_on_get;
02057
02059 CCache<ST>* feature_cache;
02060 };
02061
02062 #ifndef DOXYGEN_SHOULD_SKIP_THIS
02063
02067 template<> inline EFeatureType CStringFeatures<bool>::get_feature_type()
02068 {
02069 return F_BOOL;
02070 }
02071
02076 template<> inline EFeatureType CStringFeatures<char>::get_feature_type()
02077 {
02078 return F_CHAR;
02079 }
02080
02085 template<> inline EFeatureType CStringFeatures<uint8_t>::get_feature_type()
02086 {
02087 return F_BYTE;
02088 }
02089
02094 template<> inline EFeatureType CStringFeatures<int16_t>::get_feature_type()
02095 {
02096 return F_SHORT;
02097 }
02098
02103 template<> inline EFeatureType CStringFeatures<uint16_t>::get_feature_type()
02104 {
02105 return F_WORD;
02106 }
02107
02112 template<> inline EFeatureType CStringFeatures<int32_t>::get_feature_type()
02113 {
02114 return F_INT;
02115 }
02116
02121 template<> inline EFeatureType CStringFeatures<uint32_t>::get_feature_type()
02122 {
02123 return F_UINT;
02124 }
02125
02130 template<> inline EFeatureType CStringFeatures<int64_t>::get_feature_type()
02131 {
02132 return F_LONG;
02133 }
02134
02139 template<> inline EFeatureType CStringFeatures<uint64_t>::get_feature_type()
02140 {
02141 return F_ULONG;
02142 }
02143
02148 template<> inline EFeatureType CStringFeatures<float32_t>::get_feature_type()
02149 {
02150 return F_SHORTREAL;
02151 }
02152
02157 template<> inline EFeatureType CStringFeatures<float64_t>::get_feature_type()
02158 {
02159 return F_DREAL;
02160 }
02161
02166 template<> inline EFeatureType CStringFeatures<floatmax_t>::get_feature_type()
02167 {
02168 return F_LONGREAL;
02169 }
02170
02171 template<> inline bool CStringFeatures<bool>::get_masked_symbols(bool symbol, uint8_t mask)
02172 {
02173 return symbol;
02174 }
02175 template<> inline float32_t CStringFeatures<float32_t>::get_masked_symbols(float32_t symbol, uint8_t mask)
02176 {
02177 return symbol;
02178 }
02179 template<> inline float64_t CStringFeatures<float64_t>::get_masked_symbols(float64_t symbol, uint8_t mask)
02180 {
02181 return symbol;
02182 }
02183 template<> inline floatmax_t CStringFeatures<floatmax_t>::get_masked_symbols(floatmax_t symbol, uint8_t mask)
02184 {
02185 return symbol;
02186 }
02187
02188 template<> inline bool CStringFeatures<bool>::shift_offset(bool symbol, int32_t amount)
02189 {
02190 return false;
02191 }
02192 template<> inline float32_t CStringFeatures<float32_t>::shift_offset(float32_t symbol, int32_t amount)
02193 {
02194 return 0;
02195 }
02196 template<> inline float64_t CStringFeatures<float64_t>::shift_offset(float64_t symbol, int32_t amount)
02197 {
02198 return 0;
02199 }
02200 template<> inline floatmax_t CStringFeatures<floatmax_t>::shift_offset(floatmax_t symbol, int32_t amount)
02201 {
02202 return 0;
02203 }
02204
02205 template<> inline bool CStringFeatures<bool>::shift_symbol(bool symbol, int32_t amount)
02206 {
02207 return symbol;
02208 }
02209 template<> inline float32_t CStringFeatures<float32_t>::shift_symbol(float32_t symbol, int32_t amount)
02210 {
02211 return symbol;
02212 }
02213 template<> inline float64_t CStringFeatures<float64_t>::shift_symbol(float64_t symbol, int32_t amount)
02214 {
02215 return symbol;
02216 }
02217 template<> inline floatmax_t CStringFeatures<floatmax_t>::shift_symbol(floatmax_t symbol, int32_t amount)
02218 {
02219 return symbol;
02220 }
02221
02222 #ifndef SUNOS
02223 template<> template <class CT> bool CStringFeatures<float32_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
02224 {
02225 return false;
02226 }
02227 template<> template <class CT> bool CStringFeatures<float64_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
02228 {
02229 return false;
02230 }
02231 template<> template <class CT> bool CStringFeatures<floatmax_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
02232 {
02233 return false;
02234 }
02235 #endif
02236
02237 template<> inline void CStringFeatures<float32_t>::embed_features(int32_t p_order)
02238 {
02239 }
02240 template<> inline void CStringFeatures<float64_t>::embed_features(int32_t p_order)
02241 {
02242 }
02243 template<> inline void CStringFeatures<floatmax_t>::embed_features(int32_t p_order)
02244 {
02245 }
02246
02247 template<> inline void CStringFeatures<float32_t>::compute_symbol_mask_table(int64_t max_val)
02248 {
02249 }
02250 template<> inline void CStringFeatures<float64_t>::compute_symbol_mask_table(int64_t max_val)
02251 {
02252 }
02253 template<> inline void CStringFeatures<floatmax_t>::compute_symbol_mask_table(int64_t max_val)
02254 {
02255 }
02256
02257 template<> inline float32_t CStringFeatures<float32_t>::embed_word(float32_t* seq, int32_t len)
02258 {
02259 return 0;
02260 }
02261 template<> inline float64_t CStringFeatures<float64_t>::embed_word(float64_t* seq, int32_t len)
02262 {
02263 return 0;
02264 }
02265 template<> inline floatmax_t CStringFeatures<floatmax_t>::embed_word(floatmax_t* seq, int32_t len)
02266 {
02267 return 0;
02268 }
02269
02270 template<> inline void CStringFeatures<float32_t>::unembed_word(float32_t word, uint8_t* seq, int32_t len)
02271 {
02272 }
02273 template<> inline void CStringFeatures<float64_t>::unembed_word(float64_t word, uint8_t* seq, int32_t len)
02274 {
02275 }
02276 template<> inline void CStringFeatures<floatmax_t>::unembed_word(floatmax_t word, uint8_t* seq, int32_t len)
02277 {
02278 }
02279 #define LOAD(f_load, sg_type) \
02280 template<> inline void CStringFeatures<sg_type>::load(CFile* loader) \
02281 { \
02282 SG_INFO( "loading...\n"); \
02283 \
02284 T_STRING<sg_type>* strs; \
02285 int32_t num_str; \
02286 int32_t max_len; \
02287 loader->f_load(strs, num_str, max_len); \
02288 set_features(strs, num_str, max_len); \
02289 }
02290
02291 LOAD(get_bool_string_list, bool)
02292 LOAD(get_char_string_list, char)
02293 LOAD(get_byte_string_list, uint8_t)
02294 LOAD(get_short_string_list, int16_t)
02295 LOAD(get_word_string_list, uint16_t)
02296 LOAD(get_int_string_list, int32_t)
02297 LOAD(get_uint_string_list, uint32_t)
02298 LOAD(get_long_string_list, int64_t)
02299 LOAD(get_ulong_string_list, uint64_t)
02300 LOAD(get_shortreal_string_list, float32_t)
02301 LOAD(get_real_string_list, float64_t)
02302 LOAD(get_longreal_string_list, floatmax_t)
02303 #undef LOAD
02304
02305 #define SAVE(f_write, sg_type) \
02306 template<> inline void CStringFeatures<sg_type>::save(CFile* writer) \
02307 { \
02308 ASSERT(writer); \
02309 writer->f_write(features, num_vectors); \
02310 }
02311
02312 SAVE(set_bool_string_list, bool)
02313 SAVE(set_char_string_list, char)
02314 SAVE(set_byte_string_list, uint8_t)
02315 SAVE(set_short_string_list, int16_t)
02316 SAVE(set_word_string_list, uint16_t)
02317 SAVE(set_int_string_list, int32_t)
02318 SAVE(set_uint_string_list, uint32_t)
02319 SAVE(set_long_string_list, int64_t)
02320 SAVE(set_ulong_string_list, uint64_t)
02321 SAVE(set_shortreal_string_list, float32_t)
02322 SAVE(set_real_string_list, float64_t)
02323 SAVE(set_longreal_string_list, floatmax_t)
02324 #undef SAVE
02325 #endif // DOXYGEN_SHOULD_SKIP_THIS
02326 }
02327 #endif // _CSTRINGFEATURES__H__