BitString.h

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2009 Soeren Sonnenburg
00008  * Copyright (C) 2009 Fraunhofer Institute FIRST and Max Planck Society
00009  */
00010 #ifndef __BITSTRING_H__
00011 #define __BITSTRING_H__
00012 
00013 #include <shogun/features/Alphabet.h>
00014 #include <shogun/lib/common.h>
00015 #include <shogun/lib/io.h>
00016 #include <shogun/lib/MemoryMappedFile.h>
00017 #include <shogun/lib/Mathematics.h>
00018 
00019 namespace shogun
00020 {
00028 class CBitString : public CSGObject
00029 {
00030     public:
00038         CBitString(EAlphabet alpha, int32_t width=1) : CSGObject(), string(NULL), length(0)
00039         {
00040             alphabet=new CAlphabet(alpha);
00041             int32_t nbits=alphabet->get_num_bits();
00042             word_len = width*nbits;
00043 
00044             mask=0;
00045             for (int32_t j=0; j<word_len; j++)
00046                 mask=(mask<<1) | (uint64_t) 1;
00047             mask<<=sizeof(uint64_t)*8-word_len;
00048 
00049             single_mask=0;
00050             for (int32_t j=0; j<nbits; j++)
00051                 mask=(mask<<1) | (uint64_t) 1;
00052         }
00053 
00055         ~CBitString()
00056         {
00057             cleanup();
00058             SG_UNREF(alphabet);
00059         }
00060 
00062         void cleanup()
00063         {
00064             delete[] string;
00065             string=NULL;
00066             length=0;
00067         }
00068 
00074         void obtain_from_char(char* str, uint64_t len)
00075         {
00076             cleanup();
00077             uint64_t stream_len=len/sizeof(uint64_t)+1;
00078             string=new uint64_t[stream_len];
00079             length=len;
00080 
00081             uint64_t w=0;
00082             int32_t nbits=alphabet->get_num_bits();
00083             uint64_t j=0;
00084             int32_t nfit=8*sizeof(w)/nbits;
00085             for (uint64_t i=0; i<len; i++)
00086             {
00087                 w= (w << nbits) | alphabet->remap_to_bin((uint8_t) str[j]);
00088 
00089                 if (i % nfit == nfit-1)
00090                 {
00091                     string[j]=w;
00092                     j++;
00093                 }
00094             }
00095 
00096             if (j<stream_len)
00097             {
00098                 string[j]=w;
00099                 j++;
00100             }
00101 
00102             ASSERT(j==stream_len);
00103         }
00104 
00111         void load_fasta_file(const char* fname, bool ignore_invalid=false)
00112         {
00113             cleanup();
00114 
00115             uint64_t len=0;
00116             uint64_t offs=0;
00117 
00118             CMemoryMappedFile<char> f(fname);
00119 
00120             uint64_t id_len=0;
00121             char* id=f.get_line(id_len, offs);
00122 
00123             if (!id_len || id[0]!='>')
00124                 SG_SERROR("No fasta hunks (lines starting with '>') found\n");
00125 
00126             if (offs==f.get_size())
00127                 SG_SERROR("Empty file?\n");
00128 
00129             char* fasta=NULL;
00130             char* s=NULL;
00131             int32_t spanned_lines=0;
00132             int32_t fasta_len=0;
00133 
00134             while (true)
00135             {
00136                 s=f.get_line(len, offs);
00137 
00138                 if (!fasta)
00139                     fasta=s;
00140 
00141                 if (!s || len==0)
00142                     SG_SERROR("Error reading fasta entry in line %d len=%ld", spanned_lines+1, len);
00143 
00144                 if (s[0]=='>')
00145                     SG_SERROR("Multiple fasta hunks (lines starting with '>') are not supported!\n");
00146 
00147                 if (offs==f.get_size())
00148                 {
00149                     offs-=len+1; // seek to beginning
00150                     fasta_len+=len;
00151 
00152                     uint64_t w=0;
00153                     int32_t nbits=alphabet->get_num_bits();
00154                     uint64_t nfit=8*sizeof(w)/nbits;
00155 
00156                     len = fasta_len-spanned_lines;
00157                     uint64_t stream_len=len/(nfit)+1;
00158                     string=new uint64_t[stream_len];
00159                     length=len;
00160 
00161                     int32_t idx=0;
00162                     int32_t k=0;
00163 
00164                     for (int32_t j=0; j<fasta_len; j++, k++)
00165                     {
00166                         if (fasta[j]=='\n')
00167                         {
00168                             k--;
00169                             continue;
00170                         }
00171 
00172                         w= (w << nbits) | alphabet->remap_to_bin((uint8_t) fasta[j]);
00173 
00174                         if (k % nfit == nfit-1)
00175                         {
00176                             string[idx]=w;
00177                             w=0;
00178                             idx++;
00179                         }
00180                     }
00181 
00182                     if (idx<stream_len)
00183                         string[idx]=w<<(nbits*(nfit - k%nfit));
00184 
00185                     break;
00186                 }
00187 
00188                 spanned_lines++;
00189                 fasta_len+=len+1; // including '\n'
00190             }
00191         }
00192 
00198         void set_string(uint64_t* str, uint64_t len)
00199         {
00200             cleanup();
00201             string=str;
00202             length=len;
00203         }
00204 
00209         void create(uint64_t len)
00210         {
00211             cleanup();
00212             uint64_t stream_len=len/sizeof(uint64_t)+1;
00213             string=new uint64_t[stream_len];
00214             CMath::fill_vector(string, (int32_t) stream_len, (uint64_t) 0);
00215             length=len;
00216         }
00217 
00218         /*
00219         inline uint64_t condense(uint64_t bits, uint64_t mask) const
00220             {
00221                 uint64_t res = 0 ;
00222                 uint64_t m=mask ;
00223                 uint64_t b=bits ;
00224                 char cnt=0 ;
00225                 
00226                 char ar[256][256] ;
00227 
00228                 for (int i=0; i<8; i++)
00229                 {
00230                     //fprintf(stdout, "%i %lx %lx %lx %i\n", i, res, m, b, (int)cnt) ;
00231                     if (m&1)
00232                         res = res>>8 | ar[b&255][m&255] ;
00233                     //else
00234                     //  cnt++ ;
00235                     m=m>>8 ;
00236                     b=b>>8 ;
00237                 }
00238                 res=res>>cnt ;
00239                 //fprintf(stdout, "%lx %lx %lx\n", res, m, b) ;
00240                 
00241                 //res = res & bits & mask ;
00242                 
00243                 return res ;
00244             }
00245         */
00246 
00251         inline uint64_t operator[](uint64_t index) const
00252         {
00253             ASSERT(index<length);
00254 
00255             uint64_t bitindex=alphabet->get_num_bits()*index;
00256             int32_t ws=8*sizeof(uint64_t);
00257             uint64_t i=bitindex/ws;
00258             int32_t j=bitindex % ws;
00259             int32_t missing=word_len-(ws-j);
00260 
00261             //SG_SPRINT("i=%lld j=%d ws=%d word_len=%d missing=%d left=%llx shift=%d\n", i, j, ws, word_len, missing, ( string[i] << j ) & mask, ws-word_len);
00262             uint64_t res= ((string[i] << j) & mask ) >> (ws-word_len);
00263 
00264             if (missing>0)
00265                 res|= ( string[i+1] >> (ws-missing) );
00266 
00267             //res = condense(res, 1<<31|1<<24|1<<10|1<<8|1<<4|1<<2|1<<1|1) ;
00268 
00269             return res;
00270         }
00271 
00277         inline void set_binary_word(uint16_t word, uint64_t index)
00278         {
00279             ASSERT(index<length);
00280 
00281             uint64_t bitindex=alphabet->get_num_bits()*index;
00282             int32_t ws=8*sizeof(uint64_t);
00283             uint64_t i=bitindex/ws;
00284             int32_t j=bitindex % ws;
00285             int32_t missing=word_len-(ws-j);
00286 
00287             uint64_t sl = j-word_len;
00288             uint64_t ml;
00289             uint64_t wl;
00290             uint64_t mr;
00291             uint64_t wr;
00292 
00293             if (sl>0)
00294             {
00295                 ml=mask<<sl;
00296                 wl=word<<sl ;
00297             }
00298             else
00299             {
00300                 sl=-sl;
00301                 ml=mask>>sl;
00302                 wl=word>>sl;
00303             }
00304 
00305             string[i] = ( string[i] & (~ml) ) | ( wl & ml);
00306 
00307             if (missing>0)
00308             {
00309                 mr = mask<<missing;
00310                 wr = word<<missing;
00311                 string[i+1] = ( string[i+1] & (~mr) ) | ( wr & mr);
00312             }
00313 
00314         }
00315 
00317         inline uint64_t get_length() const { return length-word_len/alphabet->get_num_bits()+1; }
00318 
00320         inline virtual const char* get_name() const { return "BitString"; }
00321 
00322     private:
00324         CAlphabet* alphabet;
00326         uint64_t* string;
00328         uint64_t length;
00330         int32_t word_len;
00332         uint64_t mask;
00334         uint64_t single_mask;
00335 };
00336 }
00337 #endif //__BITSTRING_H__
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation