StringFeatures.h

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2009 Soeren Sonnenburg
00008  * Written (W) 1999-2008 Gunnar Raetsch
00009  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
00010  */
00011 
00012 #ifndef _CSTRINGFEATURES__H__
00013 #define _CSTRINGFEATURES__H__
00014 
00015 #include "lib/common.h"
00016 #include "lib/io.h"
00017 #include "lib/Cache.h"
00018 #include "lib/DynamicArray.h"
00019 #include "lib/File.h"
00020 #include "lib/MemoryMappedFile.h"
00021 #include "lib/Mathematics.h"
00022 #include "lib/Compressor.h"
00023 
00024 #include "preproc/PreProc.h"
00025 #include "preproc/StringPreProc.h"
00026 #include "features/Features.h"
00027 #include "features/Alphabet.h"
00028 
00029 #include <sys/types.h>
00030 #include <sys/stat.h>
00031 #include <dirent.h>
00032 #include <stdio.h>
00033 #include <stdlib.h>
00034 #include <unistd.h>
00035 
00036 namespace shogun
00037 {
00038 class CCompressor;
00039 enum E_COMPRESSION_TYPE;
00040 class CAlphabet;
00041 enum EAlphabet;
00042 template <class T> class CDynamicArray;
00043 class CFile;
00044 template <class T> class CMemoryMappedFile;
00045 class CMath;
00046 template <class ST> class CStringPreProc;
00047 template <class T> class T_STRING;
00048 
00049 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00050 
00051 template <class T> class T_STRING
00052 {
00053 
00054 
00055 #ifdef HAVE_BOOST_SERIALIZATION
00056 
00057   private:
00058 
00059 
00060   friend class ::boost::serialization::access;
00061   template<class Archive>
00062   void save(Archive & ar, const unsigned int archive_version) const
00063   {
00064 
00065     //SG_DEBUG("archiving T_STRING\n");
00066 
00067     ar & length;
00068 
00069     for (int i=0; i < length; ++i) {
00070       ar & string[i];
00071     }
00072 
00073     //SG_DEBUG("done archiving T_STRING\n");
00074 
00075   }
00076 
00077   template<class Archive>
00078   void load(Archive & ar, const unsigned int archive_version)
00079   {
00080 
00081     //SG_DEBUG("archiving T_STRING\n");
00082 
00083     ar & length;
00084 
00085     string = new T[length];
00086 
00087     for (int i=0; i < length; ++i) {
00088       ar & string[i];
00089     }
00090 
00091     //SG_DEBUG("done archiving T_STRING\n");
00092 
00093   }
00094 
00095   GLOBAL_BOOST_SERIALIZATION_SPLIT_MEMBER();
00096 
00097 
00098 #endif //HAVE_BOOST_SERIALIZATION
00099 
00100     public:
00102     T* string;
00104     int32_t length;
00105 };
00106 #endif // DOXYGEN_SHOULD_SKIP_THIS
00107 
00108 
00127 template <class ST> class CStringFeatures : public CFeatures
00128 {
00129     public:
00133         CStringFeatures() : CFeatures(0), alphabet(NULL), num_vectors(0),
00134         features(NULL), single_string(NULL),length_of_single_string(0),
00135         max_string_length(0), order(0), symbol_mask_table(NULL),
00136         preprocess_on_get(false), feature_cache(NULL)
00137         {
00138         }
00139 
00144         CStringFeatures(EAlphabet alpha)
00145         : CFeatures(0), num_vectors(0), features(NULL),
00146             single_string(NULL),length_of_single_string(0),
00147             max_string_length(0), order(0), symbol_mask_table(NULL),
00148             preprocess_on_get(false), feature_cache(NULL)
00149         {
00150             alphabet=new CAlphabet(alpha);
00151             SG_REF(alphabet);
00152             num_symbols=alphabet->get_num_symbols();
00153             original_num_symbols=num_symbols;
00154         }
00155 
00163         CStringFeatures(T_STRING<ST>* p_features, int32_t p_num_vectors,
00164                 int32_t p_max_string_length, EAlphabet alpha)
00165         : CFeatures(0), num_vectors(0), features(NULL),
00166             single_string(NULL),length_of_single_string(0),
00167             max_string_length(0), order(0), symbol_mask_table(NULL),
00168             preprocess_on_get(false), feature_cache(NULL)
00169         {
00170             alphabet=new CAlphabet(alpha);
00171             SG_REF(alphabet);
00172             num_symbols=alphabet->get_num_symbols();
00173             original_num_symbols=num_symbols;
00174             set_features(p_features, p_num_vectors, p_max_string_length);
00175         }
00176 
00184         CStringFeatures(T_STRING<ST>* p_features, int32_t p_num_vectors,
00185                 int32_t p_max_string_length, CAlphabet* alpha)
00186         : CFeatures(0), num_vectors(0), features(NULL),
00187             single_string(NULL),length_of_single_string(0),
00188             max_string_length(0), order(0), symbol_mask_table(NULL),
00189             preprocess_on_get(false), feature_cache(NULL)
00190         {
00191             alphabet=new CAlphabet(alpha);
00192             SG_REF(alphabet);
00193             num_symbols=alphabet->get_num_symbols();
00194             original_num_symbols=num_symbols;
00195             set_features(p_features, p_num_vectors, p_max_string_length);
00196         }
00197 
00202         CStringFeatures(CAlphabet* alpha)
00203         : CFeatures(0), num_vectors(0), features(NULL),
00204             single_string(NULL),length_of_single_string(0),
00205             max_string_length(0), order(0), symbol_mask_table(NULL),
00206             preprocess_on_get(false), feature_cache(NULL)
00207         {
00208             ASSERT(alpha);
00209             SG_REF(alpha);
00210             alphabet=alpha;
00211             num_symbols=alphabet->get_num_symbols();
00212             original_num_symbols=num_symbols;
00213         }
00214 
00216         CStringFeatures(const CStringFeatures & orig)
00217         : CFeatures(orig), num_vectors(orig.num_vectors),
00218             single_string(orig.single_string),
00219             length_of_single_string(orig.length_of_single_string),
00220             max_string_length(orig.max_string_length),
00221             num_symbols(orig.num_symbols),
00222             original_num_symbols(orig.original_num_symbols),
00223             order(orig.order), preprocess_on_get(false),
00224             feature_cache(NULL)
00225         {
00226             ASSERT(orig.single_string == NULL); //not implemented
00227 
00228             alphabet=orig.alphabet;
00229             SG_REF(alphabet);
00230 
00231             if (orig.features)
00232             {
00233                 features=new T_STRING<ST>[orig.num_vectors];
00234 
00235                 for (int32_t i=0; i<num_vectors; i++)
00236                 {
00237                     features[i].string=new ST[orig.features[i].length];
00238                     features[i].length=orig.features[i].length;
00239                     memcpy(features[i].string, orig.features[i].string, sizeof(ST)*orig.features[i].length);
00240                 }
00241             }
00242 
00243             if (orig.symbol_mask_table)
00244             {
00245                 symbol_mask_table=new ST[256];
00246                 for (int32_t i=0; i<256; i++)
00247                     symbol_mask_table[i]=orig.symbol_mask_table[i];
00248             }
00249         }
00250 
00256         CStringFeatures(CFile* loader, EAlphabet alpha=DNA)
00257         : CFeatures(loader), num_vectors(0), features(NULL), single_string(NULL),
00258             length_of_single_string(0), max_string_length(0), order(0),
00259             symbol_mask_table(NULL), preprocess_on_get(false), feature_cache(NULL)
00260         {
00261             alphabet=new CAlphabet(alpha);
00262             SG_REF(alphabet);
00263             num_symbols=alphabet->get_num_symbols();
00264             original_num_symbols=num_symbols;
00265             load(loader);
00266         }
00267 
00268         virtual ~CStringFeatures()
00269         {
00270             cleanup();
00271 
00272             SG_UNREF(alphabet);
00273         }
00274 
00276         virtual void cleanup()
00277         {
00278             if (single_string)
00279             {
00280                 delete[] single_string;
00281                 single_string=NULL;
00282             }
00283             else
00284             {
00285                 for (int32_t i=0; i<num_vectors; i++)
00286                     cleanup_feature_vector(i);
00287             }
00288 
00289             num_vectors=0;
00290             delete[] features;
00291             delete[] symbol_mask_table;
00292             features=NULL;
00293             symbol_mask_table=NULL;
00294 
00295             /* start with a fresh alphabet, but instead of emptying the histogram
00296              * create a new object (to leave the alphabet object alone if it is used
00297              * by others)
00298              */
00299             CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet());
00300             SG_UNREF(alphabet);
00301             alphabet=alpha;
00302             SG_REF(alphabet);
00303         }
00304 
00306         virtual void cleanup_feature_vector(int32_t num)
00307         {
00308             ASSERT(num<num_vectors);
00309             if (features)
00310             {
00311                 delete[] features[num].string;
00312                 features[num].string=NULL;
00313                 features[num].length=0;
00314             }
00315         }
00316 
00321         inline virtual EFeatureClass get_feature_class() { return C_STRING; }
00322 
00327         inline virtual EFeatureType get_feature_type() { return F_UNKNOWN; }
00328 
00333         inline CAlphabet* get_alphabet()
00334         {
00335             SG_REF(alphabet);
00336             return alphabet;
00337         }
00338 
00343         virtual CFeatures* duplicate() const
00344         {
00345             return new CStringFeatures<ST>(*this);
00346         }
00347 
00354         void get_feature_vector(ST** dst, int32_t* len, int32_t num)
00355         {
00356             ASSERT(features);
00357             if (num>=num_vectors)
00358             {
00359                 SG_ERROR("Index out of bounds (number of strings %d, you "
00360                         "requested %d)\n", num_vectors, num);
00361             }
00362 
00363             int32_t l;
00364             bool free_vec;
00365             ST* vec=get_feature_vector(num, l, free_vec);
00366             *len=l;
00367             *dst=(ST*) malloc(*len * sizeof(ST));
00368             ASSERT(*dst);
00369             memcpy(*dst, vec, *len * sizeof(ST));
00370             free_feature_vector(vec, num, free_vec);
00371         }
00372 
00379         void set_feature_vector(ST* src, int32_t len, int32_t num)
00380         {
00381             ASSERT(features);
00382             if (num>=num_vectors)
00383             {
00384                 SG_ERROR("Index out of bounds (number of strings %d, you "
00385                         "requested %d)\n", num_vectors, num);
00386             }
00387 
00388             if (len<=0)
00389                 SG_ERROR("String has zero or negative length\n");
00390 
00391 
00392             cleanup_feature_vector(num);
00393             features[num].length=len;
00394             features[num].string=new ST[len];
00395             memcpy(features[num].string, src, len*sizeof(ST));
00396 
00397             determine_maximum_string_length();
00398         }
00399 
00402         void enable_on_the_fly_preprocessing()
00403         {
00404             preprocess_on_get=true;
00405         }
00406 
00410         void disable_on_the_fly_preprocessing()
00411         {
00412             preprocess_on_get=false;
00413         }
00414 
00423         ST* get_feature_vector(int32_t num, int32_t& len, bool& dofree)
00424         {
00425             ASSERT(features);
00426             ASSERT(num<num_vectors);
00427 
00428             if (!preprocess_on_get)
00429             {
00430                 dofree=false;
00431                 len=features[num].length;
00432                 return features[num].string;
00433             }
00434             else
00435             {
00436                 SG_DEBUG( "computing feature vector!\n") ;
00437                 ST* feat=compute_feature_vector(num, len);
00438                 dofree=true;
00439 
00440                 if (get_num_preproc())
00441                 {
00442                     ST* tmp_feat_before = feat;
00443 
00444                     for (int32_t i=0; i<get_num_preproc(); i++)
00445                     {
00446                         CStringPreProc<ST>* p = (CStringPreProc<ST>*) get_preproc(i);
00447                         feat=p->apply_to_string(tmp_feat_before, len);
00448                         SG_UNREF(p);
00449                         delete[] tmp_feat_before;
00450                         tmp_feat_before=feat;
00451                     }
00452                 }
00453                 // TODO: implement caching
00454                 return feat;
00455             }
00456         }
00457 
00462         CStringFeatures<ST>* get_transposed()
00463         {
00464             int32_t num_feat;
00465             int32_t num_vec;
00466             T_STRING<ST>* s=get_transposed(num_feat, num_vec);
00467 
00468             return new CStringFeatures<ST>(s, num_vec, num_feat, alphabet);
00469         }
00470 
00482         T_STRING<ST>* get_transposed(int32_t &num_feat, int32_t &num_vec)
00483         {
00484             num_feat=num_vectors;
00485             num_vec=get_max_vector_length();
00486             ASSERT(have_same_length());
00487 
00488             SG_DEBUG("Allocating memory for transposed string features of size %ld\n",
00489                     int64_t(num_feat)*num_vec);
00490 
00491             T_STRING<ST>* sf=new T_STRING<ST>[num_vec];
00492 
00493             for (int32_t i=0; i<num_vec; i++)
00494             {
00495                 sf[i].string=new ST[num_feat];
00496                 sf[i].length=num_feat;
00497             }
00498 
00499             for (int32_t i=0; i<num_feat; i++)
00500             {
00501                 int32_t len=0;
00502                 bool free_vec=false;
00503                 ST* vec=get_feature_vector(i, len, free_vec);
00504 
00505                 for (int32_t j=0; j<num_vec; j++)
00506                     sf[j].string[i]=vec[j];
00507 
00508                 free_feature_vector(vec, i, free_vec);
00509             }
00510             return sf;
00511         }
00512 
00519         void free_feature_vector(ST* feat_vec, int32_t num, bool dofree)
00520         {
00521             if (feature_cache)
00522                 feature_cache->unlock_entry(num);
00523 
00524             if (dofree)
00525                 delete[] feat_vec ;
00526         }
00527 
00534         virtual ST inline get_feature(int32_t vec_num, int32_t feat_num)
00535         {
00536             int32_t len;
00537             bool free_vec;
00538             ST* vec=get_feature_vector(vec_num, len, free_vec);
00539             ASSERT(feat_num<len);
00540             ST result=vec[feat_num];
00541             free_feature_vector(vec, vec_num, free_vec);
00542 
00543             return result;
00544         }
00545 
00551         virtual inline int32_t get_vector_length(int32_t vec_num)
00552         {
00553             int32_t len;
00554             bool free_vec;
00555             ST* vec=get_feature_vector(vec_num, len, free_vec);
00556             free_feature_vector(vec, vec_num, free_vec);
00557             return len;
00558         }
00559 
00564         virtual inline int32_t get_max_vector_length()
00565         {
00566             return max_string_length;
00567         }
00568 
00573         virtual inline int32_t get_num_vectors() { return num_vectors; }
00574 
00581         inline floatmax_t get_num_symbols() { return num_symbols; }
00582 
00590         inline floatmax_t get_max_num_symbols() { return CMath::powl(2,sizeof(ST)*8); }
00591 
00592         // these functions are necessary to find out about a former conversion process
00593 
00598         inline floatmax_t get_original_num_symbols() { return original_num_symbols; }
00599 
00604         inline int32_t get_order() { return order; }
00605 
00613         inline ST get_masked_symbols(ST symbol, uint8_t mask)
00614         {
00615             ASSERT(symbol_mask_table);
00616             return symbol_mask_table[mask] & symbol;
00617         }
00618 
00625         inline ST shift_offset(ST offset, int32_t amount)
00626         {
00627             ASSERT(alphabet);
00628             return (offset << (amount*alphabet->get_num_bits()));
00629         }
00630 
00637         inline ST shift_symbol(ST symbol, int32_t amount)
00638         {
00639             ASSERT(alphabet);
00640             return (symbol >> (amount*alphabet->get_num_bits()));
00641         }
00642 
00647         virtual inline void load(CFile* loader);
00648 
00657         void load_ascii_file(char* fname, bool remap_to_bin=true,
00658                 EAlphabet ascii_alphabet=DNA, EAlphabet binary_alphabet=RAWDNA)
00659         {
00660             size_t blocksize=1024*1024;
00661             size_t required_blocksize=0;
00662             uint8_t* dummy=new uint8_t[blocksize];
00663             uint8_t* overflow=NULL;
00664             int32_t overflow_len=0;
00665 
00666             cleanup();
00667 
00668             CAlphabet* alpha=new CAlphabet(ascii_alphabet);
00669             CAlphabet* alpha_bin=new CAlphabet(binary_alphabet);
00670 
00671             FILE* f=fopen(fname, "ro");
00672 
00673             if (f)
00674             {
00675                 num_vectors=0;
00676                 max_string_length=0;
00677 
00678                 SG_INFO("counting line numbers in file %s\n", fname);
00679                 size_t block_offs=0;
00680                 size_t old_block_offs=0;
00681                 fseek(f, 0, SEEK_END);
00682                 size_t fsize=ftell(f);
00683                 rewind(f);
00684 
00685                 if (blocksize>fsize)
00686                     blocksize=fsize;
00687 
00688                 SG_DEBUG("block_size=%ld file_size=%ld\n", blocksize, fsize);
00689 
00690                 size_t sz=blocksize;
00691                 while (sz == blocksize)
00692                 {
00693                     sz=fread(dummy, sizeof(uint8_t), blocksize, f);
00694                     bool contains_cr=false;
00695                     for (size_t i=0; i<sz; i++)
00696                     {
00697                         block_offs++;
00698                         if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00699                         {
00700                             num_vectors++;
00701                             contains_cr=true;
00702                             required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs);
00703                             old_block_offs=block_offs;
00704                         }
00705                     }
00706                     SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");
00707                 }
00708 
00709                 SG_INFO("found %d strings\n", num_vectors);
00710                 delete[] dummy;
00711                 blocksize=required_blocksize;
00712                 dummy = new uint8_t[blocksize];
00713                 overflow = new uint8_t[blocksize];
00714                 features=new T_STRING<ST>[num_vectors];
00715 
00716                 rewind(f);
00717                 sz=blocksize;
00718                 int32_t lines=0;
00719                 while (sz == blocksize)
00720                 {
00721                     sz=fread(dummy, sizeof(uint8_t), blocksize, f);
00722 
00723                     size_t old_sz=0;
00724                     for (size_t i=0; i<sz; i++)
00725                     {
00726                         if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00727                         {
00728                             int32_t len=i-old_sz;
00729                             //SG_PRINT("i:%d len:%d old_sz:%d\n", i, len, old_sz);
00730                             max_string_length=CMath::max(max_string_length, len+overflow_len);
00731 
00732                             features[lines].length=len;
00733                             features[lines].string=new ST[len];
00734 
00735                             if (remap_to_bin)
00736                             {
00737                                 for (int32_t j=0; j<overflow_len; j++)
00738                                     features[lines].string[j]=alpha->remap_to_bin(overflow[j]);
00739                                 for (int32_t j=0; j<len; j++)
00740                                     features[lines].string[j+overflow_len]=alpha->remap_to_bin(dummy[old_sz+j]);
00741                                 alpha->add_string_to_histogram(&dummy[old_sz], len);
00742                                 alpha_bin->add_string_to_histogram(features[lines].string, features[lines].length);
00743                             }
00744                             else
00745                             {
00746                                 for (int32_t j=0; j<overflow_len; j++)
00747                                     features[lines].string[j]=overflow[j];
00748                                 for (int32_t j=0; j<len; j++)
00749                                     features[lines].string[j+overflow_len]=dummy[old_sz+j];
00750                                 alpha->add_string_to_histogram(&dummy[old_sz], len);
00751                                 alpha->add_string_to_histogram(features[lines].string, features[lines].length);
00752                             }
00753 
00754                             // clear overflow
00755                             overflow_len=0;
00756 
00757                             //CMath::display_vector(features[lines].string, len);
00758                             old_sz=i+1;
00759                             lines++;
00760                             SG_PROGRESS(lines, 0, num_vectors, 1, "LOADING:\t");
00761                         }
00762                     }
00763                     for (size_t i=old_sz; i<sz; i++)
00764                         overflow[i-old_sz]=dummy[i];
00765 
00766                     overflow_len=sz-old_sz;
00767                 }
00768 
00769                 if (alpha->check_alphabet_size() && alpha->check_alphabet())
00770                 {
00771                     SG_INFO("file successfully read\n");
00772                     SG_INFO("max_string_length=%d\n", max_string_length);
00773                     SG_INFO("num_strings=%d\n", num_vectors);
00774                 }
00775                 fclose(f);
00776             }
00777 
00778             delete[] dummy;
00779 
00780             SG_UNREF(alphabet);
00781 
00782             if (remap_to_bin)
00783                 alphabet = alpha_bin;
00784             else
00785                 alphabet = alpha;
00786             SG_REF(alphabet);
00787             num_symbols=alphabet->get_num_symbols();
00788         }
00789 
00796         bool load_fasta_file(const char* fname, bool ignore_invalid=false)
00797         {
00798             int32_t i=0;
00799             uint64_t len=0;
00800             uint64_t offs=0;
00801             int32_t num=0;
00802             int32_t max_len=0;
00803 
00804             CMemoryMappedFile<char> f(fname);
00805 
00806             while (true)
00807             {
00808                 char* s=f.get_line(len, offs);
00809                 if (!s)
00810                     break;
00811 
00812                 if (len>0 && s[0]=='>')
00813                     num++;
00814             }
00815 
00816             if (num==0)
00817                 SG_ERROR("No fasta hunks (lines starting with '>') found\n");
00818 
00819             cleanup();
00820             SG_UNREF(alphabet);
00821             alphabet=new CAlphabet(DNA);
00822             num_symbols=alphabet->get_num_symbols();
00823 
00824             T_STRING<ST>* strings=new T_STRING<ST>[num];
00825             offs=0;
00826 
00827             for (i=0;i<num; i++)
00828             {
00829                 uint64_t id_len=0;
00830                 char* id=f.get_line(id_len, offs);
00831 
00832                 char* fasta=f.get_line(len, offs);
00833                 char* s=fasta;
00834                 int32_t fasta_len=0;
00835                 int32_t spanned_lines=0;
00836 
00837                 while (true)
00838                 {
00839                     if (!s || len==0)
00840                         SG_ERROR("Error reading fasta entry in line %d len=%ld", 4*i+1, len);
00841 
00842                     if (s[0]=='>' || offs==f.get_size())
00843                     {
00844                         offs-=len+1; // seek to beginning
00845                         if (offs==f.get_size())
00846                         {
00847                             SG_DEBUG("at EOF\n");
00848                             fasta_len+=len;
00849                         }
00850 
00851                         len = fasta_len-spanned_lines;
00852                         strings[i].string=new ST[len];
00853                         strings[i].length=len;
00854 
00855                         ST* str=strings[i].string;
00856                         int32_t idx=0;
00857                         SG_DEBUG("'%.*s', len=%d, spanned_lines=%d\n", (int32_t) id_len, id, (int32_t) len, (int32_t) spanned_lines);
00858 
00859                         for (int32_t j=0; j<fasta_len; j++)
00860                         {
00861                             if (fasta[j]=='\n')
00862                                 continue;
00863 
00864                             ST c = (ST) fasta[j];
00865 
00866                             if (ignore_invalid  && !alphabet->is_valid((uint8_t) fasta[j]))
00867                                 c = (ST) 'A';
00868 
00869                             if (idx>=len)
00870                                 SG_ERROR("idx=%d j=%d fasta_len=%d, spanned_lines=%d str='%.*s'\n", idx, j, fasta_len, spanned_lines, idx, str);
00871                             str[idx++]=c;
00872                         }
00873                         max_len=CMath::max(max_len, strings[i].length);
00874 
00875 
00876                         break;
00877                     }
00878 
00879                     spanned_lines++;
00880                     fasta_len+=len+1; // including '\n'
00881                     s=f.get_line(len, offs);
00882                 }
00883             }
00884 
00885             return set_features(strings, num, max_len);
00886         }
00887 
00895         bool load_fastq_file(const char* fname,
00896                 bool ignore_invalid=false, bool bitremap_in_single_string=false)
00897         {
00898             CMemoryMappedFile<char> f(fname);
00899 
00900             int32_t i=0;
00901             uint64_t len=0;
00902             uint64_t offs=0;
00903 
00904             int32_t num=f.get_num_lines();
00905             int32_t max_len=0;
00906 
00907             if (num%4)
00908                 SG_ERROR("Number of lines must be divisible by 4 in fastq files\n");
00909             num/=4;
00910 
00911             cleanup();
00912             SG_UNREF(alphabet);
00913             alphabet=new CAlphabet(DNA);
00914 
00915             T_STRING<ST>* strings;
00916 
00917             ST* str;
00918             if (bitremap_in_single_string)
00919             {
00920                 strings=new T_STRING<ST>[1];
00921                 strings[0].string=new ST[num];
00922                 strings[0].length=num;
00923                 f.get_line(len, offs);
00924                 f.get_line(len, offs);
00925                 order=len;
00926                 max_len=num;
00927                 offs=0;
00928                 original_num_symbols=alphabet->get_num_symbols();
00929                 int32_t max_val=alphabet->get_num_bits();
00930                 str=new ST[len];
00931             }
00932             else
00933                 strings=new T_STRING<ST>[num];
00934 
00935             for (i=0;i<num; i++)
00936             {
00937                 if (!f.get_line(len, offs))
00938                     SG_ERROR("Error reading 'read' identifier in line %d", 4*i);
00939 
00940                 char* s=f.get_line(len, offs);
00941                 if (!s || len==0)
00942                     SG_ERROR("Error reading 'read' in line %d len=%ld", 4*i+1, len);
00943 
00944                 if (bitremap_in_single_string)
00945                 {
00946                     if (len!=order)
00947                         SG_ERROR("read in line %d not of length %d (is %d)\n", 4*i+1, order, len);
00948                     for (int32_t j=0; j<order; j++)
00949                         str[j]=(ST) alphabet->remap_to_bin((uint8_t) s[j]);
00950 
00951                     strings[0].string[i]=embed_word(str, order);
00952                 }
00953                 else
00954                 {
00955                     strings[i].string=new ST[len];
00956                     strings[i].length=len;
00957                     str=strings[i].string;
00958 
00959                     if (ignore_invalid)
00960                     {
00961                         for (int32_t j=0; j<len; j++)
00962                         {
00963                             if (alphabet->is_valid((uint8_t) s[j]))
00964                                 str[j]= (ST) s[j];
00965                             else
00966                                 str[j]= (ST) 'A';
00967                         }
00968                     }
00969                     else
00970                     {
00971                         for (int32_t j=0; j<len; j++)
00972                             str[j]= (ST) s[j];
00973                     }
00974                     max_len=CMath::max(max_len, (int32_t) len);
00975                 }
00976 
00977 
00978                 if (!f.get_line(len, offs))
00979                     SG_ERROR("Error reading 'read' quality identifier in line %d", 4*i+2);
00980 
00981                 if (!f.get_line(len, offs))
00982                     SG_ERROR("Error reading 'read' quality in line %d", 4*i+3);
00983             }
00984 
00985             if (bitremap_in_single_string)
00986                 num=1;
00987 
00988             num_vectors=num;
00989             max_string_length=max_len;
00990             features=strings;
00991 
00992             return true;
00993         }
00994 
01000         bool load_from_directory(char* dirname)
01001         {
01002             struct dirent **namelist;
01003             int32_t n;
01004 
01005             CIO::set_dirname(dirname);
01006 
01007             SG_DEBUG("dirname '%s'\n", dirname);
01008 
01009             n = scandir(dirname, &namelist, &CIO::filter, alphasort);
01010             if (n <= 0)
01011             {
01012                 SG_ERROR("error calling scandir - no files found\n");
01013                 return false;
01014             }
01015             else
01016             {
01017                 T_STRING<ST>* strings=NULL;
01018 
01019                 int32_t num=0;
01020                 int32_t max_len=-1;
01021 
01022                 //usually n==num_vec, but it might not in race conditions
01023                 //(file perms modified, file erased)
01024                 strings=new T_STRING<ST>[n];
01025 
01026                 for (int32_t i=0; i<n; i++)
01027                 {
01028                     char* fname=CIO::concat_filename(namelist[i]->d_name);
01029 
01030                     struct stat s;
01031                     off_t filesize=0;
01032 
01033                     if (!stat(fname, &s) && s.st_size>0)
01034                     {
01035                         filesize=s.st_size/sizeof(ST);
01036 
01037                         FILE* f=fopen(fname, "ro");
01038                         if (f)
01039                         {
01040                             ST* str=new ST[filesize];
01041                             SG_DEBUG("%s:%ld\n", fname, (int64_t) filesize);
01042                             fread(str, sizeof(ST), filesize, f);
01043                             strings[num].string=str;
01044                             strings[num].length=filesize;
01045                             max_len=CMath::max(max_len, strings[num].length);
01046 
01047                             num++;
01048                             fclose(f);
01049                         }
01050                     }
01051                     else
01052                         SG_ERROR("empty or non readable file \'%s\'\n", fname);
01053 
01054                     free(namelist[i]);
01055                 }
01056                 free(namelist);
01057 
01058                 if (num>0 && strings)
01059                 {
01060                     set_features(strings, num, max_len);
01061                     return true;
01062                 }
01063             }
01064             return false;
01065         }
01066 
01074         bool set_features(T_STRING<ST>* p_features, int32_t p_num_vectors, int32_t p_max_string_length)
01075         {
01076             if (p_features)
01077             {
01078                 CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet());
01079 
01080                 //compute histogram for char/byte
01081                 for (int32_t i=0; i<p_num_vectors; i++)
01082                     alpha->add_string_to_histogram( p_features[i].string, p_features[i].length);
01083 
01084                 SG_INFO("max_value_in_histogram:%d\n", alpha->get_max_value_in_histogram());
01085                 SG_INFO("num_symbols_in_histogram:%d\n", alpha->get_num_symbols_in_histogram());
01086 
01087                 if (alpha->check_alphabet_size() && alpha->check_alphabet())
01088                 {
01089                     cleanup();
01090                     SG_UNREF(alphabet);
01091 
01092                     alphabet=alpha;
01093                     SG_REF(alphabet);
01094 
01095                     this->features=p_features;
01096                     this->num_vectors=p_num_vectors;
01097                     this->max_string_length=p_max_string_length;
01098 
01099                     return true;
01100                 }
01101                 else
01102                     SG_UNREF(alpha);
01103             }
01104 
01105             return false;
01106         }
01107 
01113         bool append_features(CStringFeatures<ST>* sf)
01114         {
01115             ASSERT(sf);
01116             T_STRING<ST>* new_features = new T_STRING<ST>[sf->num_vectors];
01117 
01118             for (int32_t i=0; i<sf->num_vectors; i++)
01119             {
01120                 int32_t length=sf->features[i].length;
01121                 new_features[i].string=new ST[length];
01122                 memcpy(new_features[i].string, sf->features[i].string, length);
01123                 new_features[i].length=length;
01124             }
01125             return append_features(new_features, sf->num_vectors,
01126                     sf->max_string_length);
01127         }
01128 
01139         bool append_features(T_STRING<ST>* p_features, int32_t p_num_vectors, int32_t p_max_string_length)
01140         {
01141             if (!features)
01142                 return set_features(p_features, p_num_vectors, p_max_string_length);
01143 
01144             CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet());
01145 
01146             //compute histogram for char/byte
01147             for (int32_t i=0; i<p_num_vectors; i++)
01148                 alpha->add_string_to_histogram( p_features[i].string, p_features[i].length);
01149 
01150             SG_INFO("max_value_in_histogram:%d\n", alpha->get_max_value_in_histogram());
01151             SG_INFO("num_symbols_in_histogram:%d\n", alpha->get_num_symbols_in_histogram());
01152 
01153             if (alpha->check_alphabet_size() && alpha->check_alphabet())
01154             {
01155                 SG_UNREF(alpha);
01156                 for (int32_t i=0; i<p_num_vectors; i++)
01157                     alphabet->add_string_to_histogram( p_features[i].string, p_features[i].length);
01158 
01159                 int32_t old_num_vectors=num_vectors;
01160                 num_vectors=old_num_vectors+p_num_vectors;
01161                 T_STRING<ST>* new_features = new T_STRING<ST>[num_vectors];
01162 
01163                 for (int32_t i=0; i<num_vectors; i++)
01164                 {
01165                     if (i<old_num_vectors)
01166                     {
01167                         new_features[i].string=features[i].string;
01168                         new_features[i].length=features[i].length;
01169                     }
01170                     else
01171                     {
01172                         new_features[i].string=p_features[i-old_num_vectors].string;
01173                         new_features[i].length=p_features[i-old_num_vectors].length;
01174                     }
01175                 }
01176                 delete[] features;
01177                 delete[] p_features; // free now obsolete features
01178 
01179                 this->features=new_features;
01180                 this->max_string_length=CMath::max(max_string_length, p_max_string_length);
01181 
01182                 return true;
01183             }
01184             SG_UNREF(alpha);
01185 
01186             return false;
01187         }
01188 
01195         virtual T_STRING<ST>* get_features(int32_t& num_str, int32_t& max_str_len)
01196         {
01197             num_str=num_vectors;
01198             max_str_len=max_string_length;
01199             return features;
01200         }
01201 
01208         virtual T_STRING<ST>* copy_features(int32_t& num_str, int32_t& max_str_len)
01209         {
01210             ASSERT(num_vectors>0);
01211 
01212             num_str=num_vectors;
01213             max_str_len=max_string_length;
01214             T_STRING<ST>* new_feat=new T_STRING<ST>[num_str];
01215 
01216             for (int32_t i=0; i<num_str; i++)
01217             {
01218                 int32_t len;
01219                 bool free_vec;
01220                 ST* vec=get_feature_vector(i, len, free_vec);
01221                 new_feat[i].string=new ST[len];
01222                 new_feat[i].length=len;
01223                 memcpy(new_feat[i].string, vec, ((size_t) len) * sizeof(ST));
01224                 free_feature_vector(vec, i, free_vec);
01225             }
01226 
01227             return new_feat;
01228         }
01229 
01235         virtual void get_features(T_STRING<ST>** dst, int32_t* num_str)
01236         {
01237             int32_t num_vec;
01238             int32_t max_str_len;
01239             *dst=copy_features(num_vec, max_str_len);
01240             *num_str=num_vec;
01241         }
01242 
01247         virtual inline void save(CFile* writer);
01248 
01255         virtual bool load_compressed(char* src, bool decompress)
01256         {
01257             FILE* file=NULL;
01258 
01259             if (!(file=fopen(src, "r")))
01260                 return false;
01261             cleanup();
01262 
01263             // header shogun v0
01264             char id[4];
01265             fread(&id[0], sizeof(char), 1, file);
01266             ASSERT(id[0]=='S');
01267             fread(&id[1], sizeof(char), 1, file);
01268             ASSERT(id[1]=='G');
01269             fread(&id[2], sizeof(char), 1, file);
01270             ASSERT(id[2]=='V');
01271             fread(&id[3], sizeof(char), 1, file);
01272             ASSERT(id[3]=='0');
01273 
01274             //compression type
01275             uint8_t c;
01276             fread(&c, sizeof(uint8_t), 1, file);
01277             CCompressor* compressor= new CCompressor((E_COMPRESSION_TYPE) c);
01278             //alphabet
01279             uint8_t a;
01280             delete alphabet;
01281             fread(&a, sizeof(uint8_t), 1, file);
01282             alphabet=new CAlphabet((EAlphabet) a);
01283             // number of vectors
01284             fread(&num_vectors, sizeof(int32_t), 1, file);
01285             ASSERT(num_vectors>0);
01286             // maximum string length
01287             fread(&max_string_length, sizeof(int32_t), 1, file);
01288             ASSERT(max_string_length>0);
01289 
01290             features=new T_STRING<ST>[num_vectors];
01291 
01292             // vectors
01293             for (int32_t i=0; i<num_vectors; i++)
01294             {
01295                 // vector len compressed
01296                 int32_t len_compressed;
01297                 fread(&len_compressed, sizeof(int32_t), 1, file);
01298                 // vector len uncompressed
01299                 int32_t len_uncompressed;
01300                 fread(&len_uncompressed, sizeof(int32_t), 1, file);
01301 
01302                 // vector raw data
01303                 if (decompress)
01304                 {
01305                     features[i].string=new ST[len_uncompressed];
01306                     features[i].length=len_uncompressed;
01307                     uint8_t* compressed=new uint8_t[len_compressed];
01308                     fread(compressed, len_compressed, 1, file);
01309                     uint64_t uncompressed_size=len_uncompressed;
01310                     uncompressed_size*=sizeof(ST);
01311                     compressor->decompress(compressed, len_compressed,
01312                             (uint8_t*) features[i].string, uncompressed_size);
01313                     delete[] compressed;
01314                     ASSERT(uncompressed_size==((uint64_t) len_uncompressed)*sizeof(ST));
01315                 }
01316                 else
01317                 {
01318                     int32_t offs=CMath::ceil(2.0*sizeof(int32_t)/sizeof(ST));
01319                     features[i].string=new ST[len_compressed+offs];
01320                     features[i].length=len_compressed+offs;
01321                     int32_t* feat32ptr=((int32_t*) (features[i].string));
01322                     memset(features[i].string, 0, offs*sizeof(ST));
01323                     feat32ptr[0]=(int32_t) len_compressed;
01324                     feat32ptr[1]=(int32_t) len_uncompressed;
01325                     uint8_t* compressed=(uint8_t*) (&features[i].string[offs]);
01326                     fread(compressed, len_compressed, 1, file);
01327                 }
01328             }
01329 
01330             delete compressor;
01331             fclose(file);
01332             return false;
01333         }
01334 
01342         virtual bool save_compressed(char* dest, E_COMPRESSION_TYPE compression, int level)
01343         {
01344             FILE* file=NULL;
01345 
01346             if (!(file=fopen(dest, "wb")))
01347                 return false;
01348 
01349             CCompressor* compressor= new CCompressor(compression);
01350 
01351             // header shogun v0
01352             const char* id="SGV0";
01353             fwrite(&id[0], sizeof(char), 1, file);
01354             fwrite(&id[1], sizeof(char), 1, file);
01355             fwrite(&id[2], sizeof(char), 1, file);
01356             fwrite(&id[3], sizeof(char), 1, file);
01357 
01358             //compression type
01359             uint8_t c=(uint8_t) compression;
01360             fwrite(&c, sizeof(uint8_t), 1, file);
01361             //alphabet
01362             uint8_t a=(uint8_t) alphabet->get_alphabet();
01363             fwrite(&a, sizeof(uint8_t), 1, file);
01364             // number of vectors
01365             fwrite(&num_vectors, sizeof(int32_t), 1, file);
01366             // maximum string length
01367             fwrite(&max_string_length, sizeof(int32_t), 1, file);
01368 
01369             // vectors
01370             for (int32_t i=0; i<num_vectors; i++)
01371             {
01372                 int32_t len=-1;
01373                 bool vfree;
01374                 ST* vec=get_feature_vector(i, len, vfree);
01375 
01376                 uint8_t* compressed=NULL;
01377                 uint64_t compressed_size=0;
01378 
01379                 compressor->compress((uint8_t*) vec, ((uint64_t) len)*sizeof(ST),
01380                         compressed, compressed_size, level);
01381 
01382                 int32_t len_compressed = (int32_t) compressed_size;
01383                 // vector len compressed in bytes
01384                 fwrite(&len_compressed, sizeof(int32_t), 1, file);
01385                 // vector len uncompressed in number of elements of type ST
01386                 fwrite(&len, sizeof(int32_t), 1, file);
01387                 // vector raw data
01388                 fwrite(compressed, compressed_size, 1, file);
01389                 delete[] compressed;
01390 
01391                 free_feature_vector(vec, i, vfree);
01392             }
01393 
01394             delete compressor;
01395             fclose(file);
01396             return true;
01397         }
01398 
01399 
01404         virtual int32_t get_size() { return sizeof(ST); }
01405 
01411         virtual bool apply_preproc(bool force_preprocessing=false)
01412         {
01413             SG_DEBUG( "force: %d\n", force_preprocessing);
01414 
01415             for (int32_t i=0; i<get_num_preproc(); i++)
01416             {
01417                 if ( (!is_preprocessed(i) || force_preprocessing) )
01418                 {
01419                     set_preprocessed(i);
01420                     CStringPreProc<ST>* p = (CStringPreProc<ST>*) get_preproc(i);
01421                     SG_INFO( "preprocessing using preproc %s\n", p->get_name());
01422 
01423                     if (!p->apply_to_string_features(this))
01424                     {
01425                         SG_UNREF(p);
01426                         return false;
01427                     }
01428                     else
01429                         SG_UNREF(p);
01430                 }
01431             }
01432             return true;
01433         }
01434 
01444         int32_t obtain_by_sliding_window(int32_t window_size, int32_t step_size, int32_t skip=0)
01445         {
01446             ASSERT(step_size>0);
01447             ASSERT(window_size>0);
01448             ASSERT(num_vectors==1 || single_string);
01449             ASSERT(max_string_length>=window_size ||
01450                     (single_string && length_of_single_string>=window_size));
01451 
01452             //in case we are dealing with a single remapped string
01453             //allow remapping
01454             if (single_string)
01455                 num_vectors= (length_of_single_string-window_size)/step_size + 1;
01456             else if (num_vectors==1)
01457             {
01458                 num_vectors= (max_string_length-window_size)/step_size + 1;
01459                 length_of_single_string=max_string_length;
01460             }
01461 
01462             T_STRING<ST>* f=new T_STRING<ST>[num_vectors];
01463             int32_t offs=0;
01464             for (int32_t i=0; i<num_vectors; i++)
01465             {
01466                 f[i].string=&features[0].string[offs+skip];
01467                 f[i].length=window_size-skip;
01468                 offs+=step_size;
01469             }
01470             single_string=features[0].string;
01471             delete[] features;
01472             features=f;
01473             max_string_length=window_size-skip;
01474 
01475             return num_vectors;
01476         }
01477 
01486         int32_t obtain_by_position_list(int32_t window_size, CDynamicArray<int32_t>* positions, int32_t skip=0)
01487         {
01488             ASSERT(positions);
01489             ASSERT(window_size>0);
01490             ASSERT(num_vectors==1 || single_string);
01491             ASSERT(max_string_length>=window_size ||
01492                     (single_string && length_of_single_string>=window_size));
01493 
01494             num_vectors= positions->get_num_elements();
01495             ASSERT(num_vectors>0);
01496 
01497             int32_t len;
01498 
01499             //in case we are dealing with a single remapped string
01500             //allow remapping
01501             if (single_string)
01502                 len=length_of_single_string;
01503             else
01504             {
01505                 single_string=features[0].string;
01506                 len=max_string_length;
01507                 length_of_single_string=max_string_length;
01508             }
01509 
01510             T_STRING<ST>* f=new T_STRING<ST>[num_vectors];
01511             for (int32_t i=0; i<num_vectors; i++)
01512             {
01513                 int32_t p=positions->get_element(i);
01514 
01515                 if (p>=0 && p<=len-window_size)
01516                 {
01517                     f[i].string=&features[0].string[p+skip];
01518                     f[i].length=window_size-skip;
01519                 }
01520                 else
01521                 {
01522                     num_vectors=1;
01523                     max_string_length=len;
01524                     features[0].length=len;
01525                     single_string=NULL;
01526                     delete[] f;
01527                     SG_ERROR("window (size:%d) starting at position[%d]=%d does not fit in sequence(len:%d)\n",
01528                             window_size, i, p, len);
01529                     return -1;
01530                 }
01531             }
01532 
01533             delete[] features;
01534             features=f;
01535             max_string_length=window_size-skip;
01536 
01537             return num_vectors;
01538         }
01539 
01551         inline bool obtain_from_char(CStringFeatures<char>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
01552         {
01553             return obtain_from_char_features(sf, start, p_order, gap, rev);
01554         }
01555 
01565         template <class CT>
01566             bool obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
01567             {
01568                 ASSERT(sf);
01569 
01570                 CAlphabet* alpha=sf->get_alphabet();
01571                 ASSERT(alpha->get_num_symbols_in_histogram() > 0);
01572 
01573                 this->order=p_order;
01574                 cleanup();
01575 
01576                 num_vectors=sf->get_num_vectors();
01577                 ASSERT(num_vectors>0);
01578                 max_string_length=sf->get_max_vector_length()-start;
01579                 features=new T_STRING<ST>[num_vectors];
01580 
01581                 SG_DEBUG( "%1.0llf symbols in StringFeatures<*> %d symbols in histogram\n", sf->get_num_symbols(),
01582                         alpha->get_num_symbols_in_histogram());
01583 
01584                 for (int32_t i=0; i<num_vectors; i++)
01585                 {
01586                     int32_t len=-1;
01587                     bool vfree;
01588                     CT* c=sf->get_feature_vector(i, len, vfree);
01589                     ASSERT(!vfree); // won't work when preprocessors are attached
01590 
01591                     features[i].string=new ST[len];
01592                     features[i].length=len;
01593 
01594                     ST* str=features[i].string;
01595                     for (int32_t j=0; j<len; j++)
01596                         str[j]=(ST) alpha->remap_to_bin(c[j]);
01597                 }
01598 
01599                 original_num_symbols=alpha->get_num_symbols();
01600                 int32_t max_val=alpha->get_num_bits();
01601 
01602                 SG_UNREF(alpha);
01603 
01604                 if (p_order>1)
01605                     num_symbols=CMath::powl((floatmax_t) 2, (floatmax_t) max_val*p_order);
01606                 else
01607                     num_symbols=original_num_symbols;
01608                 SG_INFO( "max_val (bit): %d order: %d -> results in num_symbols: %.0Lf\n", max_val, p_order, num_symbols);
01609 
01610                 if ( ((floatmax_t) num_symbols) > CMath::powl(((floatmax_t) 2),((floatmax_t) sizeof(ST)*8)) )
01611                 {
01612                     SG_ERROR( "symbol does not fit into datatype \"%c\" (%d)\n", (char) max_val, (int) max_val);
01613                     return false;
01614                 }
01615 
01616                 SG_DEBUG( "translate: start=%i order=%i gap=%i(size:%i)\n", start, p_order, gap, sizeof(ST)) ;
01617                 for (int32_t line=0; line<num_vectors; line++)
01618                 {
01619                     int32_t len=0;
01620                     bool vfree;
01621                     ST* fv=get_feature_vector(line, len, vfree);
01622                     ASSERT(!vfree); // won't work when preprocessors are attached
01623 
01624                     if (rev)
01625                         CAlphabet::translate_from_single_order_reversed(fv, len, start+gap, p_order+gap, max_val, gap);
01626                     else
01627                         CAlphabet::translate_from_single_order(fv, len, start+gap, p_order+gap, max_val, gap);
01628 
01629                     /* fix the length of the string -- hacky */
01630                     features[line].length-=start+gap ;
01631                     if (features[line].length<0)
01632                         features[line].length=0 ;
01633                 }
01634 
01635                 compute_symbol_mask_table(max_val);
01636 
01637                 return true;
01638             }
01639 
01647         bool have_same_length(int32_t len=-1)
01648         {
01649             if (len!=-1)
01650             {
01651                 if (len!=get_max_vector_length())
01652                     return false;
01653             }
01654             len = get_max_vector_length();
01655 
01656             for (int32_t i=0; i<num_vectors; i++)
01657             {
01658                 if (get_vector_length(i)!=len)
01659                     return false;
01660             }
01661 
01662             return true;
01663         }
01664 
01669         inline void embed_features(int32_t p_order)
01670         {
01671             ASSERT(alphabet->get_num_symbols_in_histogram() > 0);
01672 
01673             order=p_order;
01674             original_num_symbols=alphabet->get_num_symbols();
01675             int32_t max_val=alphabet->get_num_bits();
01676 
01677             if (p_order>1)
01678                 num_symbols=CMath::powl((floatmax_t) 2, (floatmax_t) max_val*p_order);
01679             else
01680                 num_symbols=original_num_symbols;
01681 
01682             SG_INFO( "max_val (bit): %d order: %d -> results in num_symbols: %.0Lf\n", max_val, p_order, num_symbols);
01683 
01684             if ( ((floatmax_t) num_symbols) > CMath::powl(((floatmax_t) 2),((floatmax_t) sizeof(ST)*8)) )
01685                 SG_WARNING("symbols did not fit into datatype \"%c\" (%d)\n", (char) max_val, (int) max_val);
01686 
01687             ST mask=0;
01688             for (int32_t i=0; i<p_order*max_val; i++)
01689                 mask= (mask<<1) | ((ST) 1);
01690 
01691             for (int32_t i=0; i<num_vectors; i++)
01692             {
01693                 int32_t len=features[i].length;
01694 
01695                 if (len < p_order)
01696                     SG_ERROR("Sequence must be longer than order (%d vs. %d)\n", len, p_order);
01697 
01698                 ST* str = features[i].string;
01699 
01700                 // convert first word
01701                 for (int32_t j=0; j<p_order; j++)
01702                     str[j]=(ST) alphabet->remap_to_bin(str[j]);
01703                 str[0]=embed_word(&str[0], p_order);
01704 
01705                 // convert the rest
01706                 int32_t idx=0;
01707                 for (int32_t j=p_order; j<len; j++)
01708                 {
01709                     str[j]=(ST) alphabet->remap_to_bin(str[j]);
01710                     str[idx+1]= ((str[idx]<<max_val) | str[j]) & mask;
01711                     idx++;
01712                 }
01713 
01714                 features[i].length=len-p_order+1;
01715             }
01716 
01717             compute_symbol_mask_table(max_val);
01718         }
01719 
01724         inline void compute_symbol_mask_table(int64_t max_val)
01725         {
01726             delete[] symbol_mask_table;
01727             symbol_mask_table=new ST[256];
01728 
01729             uint64_t mask=0;
01730             for (int32_t i=0; i< (int64_t) max_val; i++)
01731                 mask=(mask<<1) | 1;
01732 
01733             for (int32_t i=0; i<256; i++)
01734             {
01735                 uint8_t bits=(uint8_t) i;
01736                 symbol_mask_table[i]=0;
01737 
01738                 for (int32_t j=0; j<8; j++)
01739                 {
01740                     if (bits & 1)
01741                         symbol_mask_table[i]|=mask<<(max_val*j);
01742 
01743                     bits>>=1;
01744                 }
01745             }
01746         }
01747 
01754         inline void unembed_word(ST word, uint8_t* seq, int32_t len)
01755         {
01756             uint32_t nbits= (uint32_t) alphabet->get_num_bits();
01757 
01758             ST mask=0;
01759             for (int32_t i=0; i<nbits; i++)
01760                 mask=(mask<<1) | (ST) 1;
01761 
01762             for (int32_t i=0; i<len; i++)
01763             {
01764                 ST w=(word & mask);
01765                 seq[len-i-1]=alphabet->remap_to_char((uint8_t) w);
01766                 word>>=nbits;
01767             }
01768         }
01769 
01775         inline ST embed_word(ST* seq, int32_t len)
01776         {
01777             ST value=(ST) 0;
01778             uint32_t nbits= (uint32_t) alphabet->get_num_bits();
01779             for (int32_t i=0; i<len; i++)
01780             {
01781                 value<<=nbits;
01782                 value|=seq[i];
01783             }
01784 
01785             return value;
01786         }
01787 
01790         void determine_maximum_string_length()
01791         {
01792             max_string_length=0;
01793 
01794             for (int32_t i=0; i<num_vectors; i++)
01795                 max_string_length=CMath::max(max_string_length, features[i].length);
01796         }
01797 
01805         static ST* get_zero_terminated_string_copy(T_STRING<ST> str)
01806         {
01807             int32_t l=str.length;
01808             ST* s=new ST[l+1];
01809             memcpy(s, str.string, sizeof(ST)*l);
01810             s[l]='\0';
01811             return s;
01812         }
01813 
01820         virtual void set_feature_vector(int32_t num, ST* string, int32_t len)
01821         {
01822             ASSERT(features);
01823             ASSERT(num<num_vectors);
01824 
01825             features[num].length=len ;
01826             features[num].string=string ;
01827 
01828             max_string_length=CMath::max(len, max_string_length);
01829         }
01830 
01831 
01834         virtual void get_histogram(float64_t** hist, int32_t* rows, int32_t* cols, bool normalize=true)
01835         {
01836             int32_t nsym=get_num_symbols();
01837             int32_t slen=get_max_vector_length();
01838             int64_t sz=int64_t(nsym)*slen*sizeof(float64_t);
01839             float64_t* h= (float64_t*) malloc(sz);
01840             ASSERT(h);
01841             memset(h, 0, sz);
01842 
01843             float64_t* h_normalizer=new float64_t[slen];
01844             memset(h_normalizer, 0, slen*sizeof(float64_t));
01845             int32_t num_str=get_num_vectors();
01846             for (int32_t i=0; i<num_str; i++)
01847             {
01848                 int32_t len;
01849                 bool free_vec;
01850                 ST* vec=get_feature_vector(i, len, free_vec);
01851                 for (int32_t j=0; j<len; j++)
01852                 {
01853                     h[int64_t(j)*nsym+alphabet->remap_to_bin(vec[j])]++;
01854                     h_normalizer[j]++;
01855                 }
01856                 free_feature_vector(vec, i, free_vec);
01857             }
01858 
01859             if (normalize)
01860             {
01861                 for (int32_t i=0; i<slen; i++)
01862                 {
01863                     for (int32_t j=0; j<nsym; j++)
01864                     {
01865                         if (h_normalizer && h_normalizer[i])
01866                             h[int64_t(i)*nsym+j]/=h_normalizer[i];
01867                     }
01868                 }
01869             }
01870             delete[] h_normalizer;
01871 
01872             *hist=h;
01873             *rows=nsym;
01874             *cols=slen;
01875         }
01876 
01879         virtual void create_random(float64_t* hist, int32_t rows, int32_t cols, int32_t num_vec)
01880         {
01881             ASSERT(rows == get_num_symbols());
01882             cleanup();
01883             float64_t* randoms=new float64_t[cols];
01884             T_STRING<ST>* sf=new T_STRING<ST>[num_vec];
01885 
01886             for (int32_t i=0; i<num_vec; i++)
01887             {
01888                 sf[i].string=new ST[cols];
01889                 sf[i].length=cols;
01890 
01891                 CMath::random_vector(randoms, cols, 0.0, 1.0);
01892 
01893                 for (int32_t j=0; j<cols; j++)
01894                 {
01895                     float64_t lik=hist[int64_t(j)*rows+0];
01896 
01897                     int32_t c;
01898                     for (c=0; c<rows-1; c++)
01899                     {
01900                         if (randoms[j]<=lik)
01901                             break;
01902                         lik+=hist[int64_t(j)*rows+c+1];
01903                     }
01904                     sf[i].string[j]=alphabet->remap_to_char(c);
01905                 }
01906             }
01907             delete[] randoms;
01908             set_features(sf, num_vec, cols);
01909         }
01910 
01912         inline virtual const char* get_name() const { return "StringFeatures"; }
01913 
01914     protected:
01915 
01926         virtual ST* compute_feature_vector(int32_t num, int32_t& len)
01927         {
01928             ASSERT(features && num<num_vectors);
01929 
01930             len=features[num].length;
01931             if (len<=0)
01932                 return NULL;
01933 
01934             ST* target=new ST[len];
01935             memcpy(target, features[num].string, len*sizeof(ST));
01936             return target;
01937         }
01938 
01939 #ifdef HAVE_BOOST_SERIALIZATION
01940     private:
01941 
01942         friend class ::boost::serialization::access;
01943         template<class Archive>
01944             void save(Archive & ar, const unsigned int archive_version) const
01945             {
01946 
01947                 SG_DEBUG("archiving StringFeatures\n");
01948 
01949                 ar & ::boost::serialization::base_object<CFeatures>(*this);
01950 
01951                 ar & alphabet;
01952 
01953                 ar & num_vectors;
01954                 for (int i=0; i < num_vectors; ++i) {
01955                     ar & features[i];
01956                 }
01957 
01958                 ar & length_of_single_string;
01959                 for (int i=0; i < length_of_single_string; ++i) {
01960                     ar & single_string[i];
01961                 }
01962 
01963                 ar & max_string_length;
01964                 ar & num_symbols;
01965                 ar & original_num_symbols;
01966                 ar & order;
01967 
01969                 //TODO?! how long
01970                 //ST* symbol_mask_table;
01971 
01972                 SG_DEBUG("done archiving StringFeatures\n");
01973 
01974             }
01975 
01976         template<class Archive>
01977             void load(Archive & ar, const unsigned int archive_version)
01978             {
01979 
01980                 SG_DEBUG("archiving StringFeatures\n");
01981 
01982                 ar & ::boost::serialization::base_object<CFeatures>(*this);
01983 
01984 
01985                 ar & alphabet;
01986 
01987                 ar & num_vectors;
01988 
01989                 //T_STRING<ST>* features = new T_STRING<ST>[num_vectors];
01990                 features = new T_STRING<ST>[num_vectors];
01991                 for (int i=0; i < num_vectors; ++i) {
01992                     ar & features[i];
01993                 }
01994 
01995 
01996                 ar & length_of_single_string;
01997 
01998                 //ST* single_string = new ST[length_of_single_string];
01999                 single_string = new ST[length_of_single_string];
02000                 for (int i=0; i < length_of_single_string; ++i) {
02001                     ar & single_string[i];
02002                 }
02003 
02004                 ar & max_string_length;
02005                 ar & num_symbols;
02006                 ar & original_num_symbols;
02007                 ar & order;
02008 
02010                 //TODO?! how long -> num_of_symbols?
02011                 //ST* symbol_mask_table;
02012 
02013                 SG_DEBUG("done archiving StringFeatures\n");
02014 
02015             }
02016 
02017         GLOBAL_BOOST_SERIALIZATION_SPLIT_MEMBER();
02018 
02019 
02020 #endif //HAVE_BOOST_SERIALIZATION
02021 
02022 
02023     protected:
02024 
02026         CAlphabet* alphabet;
02027 
02029         int32_t num_vectors;
02030 
02032         T_STRING<ST>* features;
02033 
02035         ST* single_string;
02036 
02038         int32_t length_of_single_string;
02039 
02041         int32_t max_string_length;
02042 
02044         floatmax_t num_symbols;
02045 
02047         floatmax_t original_num_symbols;
02048 
02050         int32_t order;
02051 
02053         ST* symbol_mask_table;
02054 
02056         bool preprocess_on_get;
02057 
02059         CCache<ST>* feature_cache;
02060 };
02061 
02062 #ifndef DOXYGEN_SHOULD_SKIP_THIS
02063 
02067 template<> inline EFeatureType CStringFeatures<bool>::get_feature_type()
02068 {
02069     return F_BOOL;
02070 }
02071 
02076 template<> inline EFeatureType CStringFeatures<char>::get_feature_type()
02077 {
02078     return F_CHAR;
02079 }
02080 
02085 template<> inline EFeatureType CStringFeatures<uint8_t>::get_feature_type()
02086 {
02087     return F_BYTE;
02088 }
02089 
02094 template<> inline EFeatureType CStringFeatures<int16_t>::get_feature_type()
02095 {
02096     return F_SHORT;
02097 }
02098 
02103 template<> inline EFeatureType CStringFeatures<uint16_t>::get_feature_type()
02104 {
02105     return F_WORD;
02106 }
02107 
02112 template<> inline EFeatureType CStringFeatures<int32_t>::get_feature_type()
02113 {
02114     return F_INT;
02115 }
02116 
02121 template<> inline EFeatureType CStringFeatures<uint32_t>::get_feature_type()
02122 {
02123     return F_UINT;
02124 }
02125 
02130 template<> inline EFeatureType CStringFeatures<int64_t>::get_feature_type()
02131 {
02132     return F_LONG;
02133 }
02134 
02139 template<> inline EFeatureType CStringFeatures<uint64_t>::get_feature_type()
02140 {
02141     return F_ULONG;
02142 }
02143 
02148 template<> inline EFeatureType CStringFeatures<float32_t>::get_feature_type()
02149 {
02150     return F_SHORTREAL;
02151 }
02152 
02157 template<> inline EFeatureType CStringFeatures<float64_t>::get_feature_type()
02158 {
02159     return F_DREAL;
02160 }
02161 
02166 template<> inline EFeatureType CStringFeatures<floatmax_t>::get_feature_type()
02167 {
02168     return F_LONGREAL;
02169 }
02170 
02171 template<> inline bool CStringFeatures<bool>::get_masked_symbols(bool symbol, uint8_t mask)
02172 {
02173     return symbol;
02174 }
02175 template<> inline float32_t CStringFeatures<float32_t>::get_masked_symbols(float32_t symbol, uint8_t mask)
02176 {
02177     return symbol;
02178 }
02179 template<> inline float64_t CStringFeatures<float64_t>::get_masked_symbols(float64_t symbol, uint8_t mask)
02180 {
02181     return symbol;
02182 }
02183 template<> inline floatmax_t CStringFeatures<floatmax_t>::get_masked_symbols(floatmax_t symbol, uint8_t mask)
02184 {
02185     return symbol;
02186 }
02187 
02188 template<> inline bool CStringFeatures<bool>::shift_offset(bool symbol, int32_t amount)
02189 {
02190     return false;
02191 }
02192 template<> inline float32_t CStringFeatures<float32_t>::shift_offset(float32_t symbol, int32_t amount)
02193 {
02194     return 0;
02195 }
02196 template<> inline float64_t CStringFeatures<float64_t>::shift_offset(float64_t symbol, int32_t amount)
02197 {
02198     return 0;
02199 }
02200 template<> inline floatmax_t CStringFeatures<floatmax_t>::shift_offset(floatmax_t symbol, int32_t amount)
02201 {
02202     return 0;
02203 }
02204 
02205 template<> inline bool CStringFeatures<bool>::shift_symbol(bool symbol, int32_t amount)
02206 {
02207     return symbol;
02208 }
02209 template<> inline float32_t CStringFeatures<float32_t>::shift_symbol(float32_t symbol, int32_t amount)
02210 {
02211     return symbol;
02212 }
02213 template<> inline float64_t CStringFeatures<float64_t>::shift_symbol(float64_t symbol, int32_t amount)
02214 {
02215     return symbol;
02216 }
02217 template<> inline floatmax_t CStringFeatures<floatmax_t>::shift_symbol(floatmax_t symbol, int32_t amount)
02218 {
02219     return symbol;
02220 }
02221 
02222 #ifndef SUNOS
02223 template<>  template <class CT> bool CStringFeatures<float32_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
02224 {
02225     return false;
02226 }
02227 template<>  template <class CT> bool CStringFeatures<float64_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
02228 {
02229     return false;
02230 }
02231 template<>  template <class CT> bool CStringFeatures<floatmax_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
02232 {
02233     return false;
02234 }
02235 #endif
02236 
02237 template<>  inline void CStringFeatures<float32_t>::embed_features(int32_t p_order)
02238 {
02239 }
02240 template<>  inline void CStringFeatures<float64_t>::embed_features(int32_t p_order)
02241 {
02242 }
02243 template<>  inline void CStringFeatures<floatmax_t>::embed_features(int32_t p_order)
02244 {
02245 }
02246 
02247 template<>  inline void CStringFeatures<float32_t>::compute_symbol_mask_table(int64_t max_val)
02248 {
02249 }
02250 template<>  inline void CStringFeatures<float64_t>::compute_symbol_mask_table(int64_t max_val)
02251 {
02252 }
02253 template<>  inline void CStringFeatures<floatmax_t>::compute_symbol_mask_table(int64_t max_val)
02254 {
02255 }
02256 
02257 template<>  inline float32_t CStringFeatures<float32_t>::embed_word(float32_t* seq, int32_t len)
02258 {
02259     return 0;
02260 }
02261 template<>  inline float64_t CStringFeatures<float64_t>::embed_word(float64_t* seq, int32_t len)
02262 {
02263     return 0;
02264 }
02265 template<>  inline floatmax_t CStringFeatures<floatmax_t>::embed_word(floatmax_t* seq, int32_t len)
02266 {
02267     return 0;
02268 }
02269 
02270 template<>  inline void CStringFeatures<float32_t>::unembed_word(float32_t word, uint8_t* seq, int32_t len)
02271 {
02272 }
02273 template<>  inline void CStringFeatures<float64_t>::unembed_word(float64_t word, uint8_t* seq, int32_t len)
02274 {
02275 }
02276 template<>  inline void CStringFeatures<floatmax_t>::unembed_word(floatmax_t word, uint8_t* seq, int32_t len)
02277 {
02278 }
02279 #define LOAD(f_load, sg_type)                                               \
02280 template<> inline void CStringFeatures<sg_type>::load(CFile* loader)        \
02281 {                                                                           \
02282     SG_INFO( "loading...\n");                                               \
02283                                                                             \
02284     T_STRING<sg_type>* strs;                                                \
02285     int32_t num_str;                                                        \
02286     int32_t max_len;                                                        \
02287     loader->f_load(strs, num_str, max_len);                                 \
02288     set_features(strs, num_str, max_len);                                   \
02289 }
02290 
02291 LOAD(get_bool_string_list, bool)
02292 LOAD(get_char_string_list, char)
02293 LOAD(get_byte_string_list, uint8_t)
02294 LOAD(get_short_string_list, int16_t)
02295 LOAD(get_word_string_list, uint16_t)
02296 LOAD(get_int_string_list, int32_t)
02297 LOAD(get_uint_string_list, uint32_t)
02298 LOAD(get_long_string_list, int64_t)
02299 LOAD(get_ulong_string_list, uint64_t)
02300 LOAD(get_shortreal_string_list, float32_t)
02301 LOAD(get_real_string_list, float64_t)
02302 LOAD(get_longreal_string_list, floatmax_t)
02303 #undef LOAD
02304 
02305 #define SAVE(f_write, sg_type)                                              \
02306 template<> inline void CStringFeatures<sg_type>::save(CFile* writer)        \
02307 {                                                                           \
02308     ASSERT(writer);                                                         \
02309     writer->f_write(features, num_vectors);                                 \
02310 }
02311 
02312 SAVE(set_bool_string_list, bool)
02313 SAVE(set_char_string_list, char)
02314 SAVE(set_byte_string_list, uint8_t)
02315 SAVE(set_short_string_list, int16_t)
02316 SAVE(set_word_string_list, uint16_t)
02317 SAVE(set_int_string_list, int32_t)
02318 SAVE(set_uint_string_list, uint32_t)
02319 SAVE(set_long_string_list, int64_t)
02320 SAVE(set_ulong_string_list, uint64_t)
02321 SAVE(set_shortreal_string_list, float32_t)
02322 SAVE(set_real_string_list, float64_t)
02323 SAVE(set_longreal_string_list, floatmax_t)
02324 #undef SAVE
02325 #endif // DOXYGEN_SHOULD_SKIP_THIS
02326 }
02327 #endif // _CSTRINGFEATURES__H__
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation