BitString.h

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2009 Soeren Sonnenburg
00008  * Copyright (C) 2009 Fraunhofer Institute FIRST and Max Planck Society
00009  */
00010 #ifndef __BITSTRING_H__
00011 #define __BITSTRING_H__
00012 
00013 #include <shogun/features/Alphabet.h>
00014 #include <shogun/lib/common.h>
00015 #include <shogun/lib/io.h>
00016 #include <shogun/lib/MemoryMappedFile.h>
00017 #include <shogun/lib/Mathematics.h>
00018 
00026 class CBitString : public CSGObject
00027 {
00028     public:
00036         CBitString(EAlphabet alpha, int32_t width=1) : CSGObject(), string(NULL), length(0)
00037         {
00038             alphabet=new CAlphabet(alpha);
00039             int32_t nbits=alphabet->get_num_bits();
00040             word_len=width*nbits;
00041             mask=0;
00042 
00043             for (int32_t j=0; j<word_len; j++)
00044                 mask=(mask<<1) | (uint64_t) 1;
00045             mask<<=sizeof(uint64_t)*8-word_len;
00046         }
00047 
00049         ~CBitString()
00050         {
00051             cleanup();
00052             SG_UNREF(alphabet);
00053         }
00054 
00056         void cleanup()
00057         {
00058             delete[] string;
00059             string=NULL;
00060             length=0;
00061         }
00062 
00068         void obtain_from_char(char* str, uint64_t len)
00069         {
00070             cleanup();
00071             uint64_t stream_len=len/sizeof(uint64_t)+1;
00072             string=new uint64_t[stream_len];
00073             length=len;
00074 
00075             uint64_t w=0;
00076             int32_t nbits=alphabet->get_num_bits();
00077             uint64_t j=0;
00078             int32_t nfit=8*sizeof(w)/nbits;
00079             for (uint64_t i=0; i<len; i++)
00080             {
00081                 w= (w << nbits) | alphabet->remap_to_bin((uint8_t) str[j]);
00082 
00083                 if (i % nfit == nfit-1)
00084                 {
00085                     string[j]=w;
00086                     j++;
00087                 }
00088             }
00089 
00090             if (j<stream_len)
00091             {
00092                 string[j]=w;
00093                 j++;
00094             }
00095 
00096             ASSERT(j==stream_len);
00097         }
00098 
00105         void load_fasta_file(const char* fname, bool ignore_invalid=false)
00106         {
00107             cleanup();
00108 
00109             uint64_t len=0;
00110             uint64_t offs=0;
00111 
00112             CMemoryMappedFile<char> f(fname);
00113 
00114             uint64_t id_len=0;
00115             char* id=f.get_line(id_len, offs);
00116 
00117             if (!id_len || id[0]!='>')
00118                 SG_SERROR("No fasta hunks (lines starting with '>') found\n");
00119 
00120             if (offs==f.get_size())
00121                 SG_SERROR("Empty file?\n");
00122 
00123             char* fasta=NULL;
00124             char* s=NULL;
00125             int32_t spanned_lines=0;
00126             int32_t fasta_len=0;
00127 
00128             while (true)
00129             {
00130                 s=f.get_line(len, offs);
00131 
00132                 if (!fasta)
00133                     fasta=s;
00134 
00135                 if (!s || len==0)
00136                     SG_SERROR("Error reading fasta entry in line %d len=%ld", spanned_lines+1, len);
00137 
00138                 if (s[0]=='>')
00139                     SG_SERROR("Multiple fasta hunks (lines starting with '>') are not supported!\n");
00140 
00141                 if (offs==f.get_size())
00142                 {
00143                     offs-=len+1; // seek to beginning
00144                     fasta_len+=len;
00145 
00146                     uint64_t w=0;
00147                     int32_t nbits=alphabet->get_num_bits();
00148                     uint64_t nfit=8*sizeof(w)/nbits;
00149 
00150                     len = fasta_len-spanned_lines;
00151                     uint64_t stream_len=len/(nfit)+1;
00152                     string=new uint64_t[stream_len];
00153                     length=len;
00154 
00155                     int32_t idx=0;
00156                     int32_t k=0;
00157 
00158                     for (int32_t j=0; j<fasta_len; j++, k++)
00159                     {
00160                         if (fasta[j]=='\n')
00161                         {
00162                             k--;
00163                             continue;
00164                         }
00165 
00166                         w= (w << nbits) | alphabet->remap_to_bin((uint8_t) fasta[j]);
00167 
00168                         if (k % nfit == nfit-1)
00169                         {
00170                             string[idx]=w;
00171                             w=0;
00172                             idx++;
00173                         }
00174                     }
00175 
00176                     if (idx<stream_len)
00177                         string[idx]=w<<(nbits*(nfit - k%nfit));
00178 
00179                     break;
00180                 }
00181 
00182                 spanned_lines++;
00183                 fasta_len+=len+1; // including '\n'
00184             }
00185         }
00186 
00192         void set_string(uint64_t* str, uint64_t len)
00193         {
00194             cleanup();
00195             string=str;
00196             length=len;
00197         }
00198 
00199         inline uint64_t condense(uint64_t bits, uint64_t mask) const
00200             {
00201                 uint64_t res = 0 ;
00202                 uint64_t m=mask ;
00203                 uint64_t b=bits ;
00204                 char cnt=0 ;
00205                 
00206                 char ar[256][256] ;
00207 
00208                 for (int i=0; i<8; i++)
00209                 {
00210                     //fprintf(stdout, "%i %lx %lx %lx %i\n", i, res, m, b, (int)cnt) ;
00211                     if (m&1)
00212                         res = res>>8 | ar[b&255][m&255] ;
00213                     //else
00214                     //  cnt++ ;
00215                     m=m>>8 ;
00216                     b=b>>8 ;
00217                 }
00218                 res=res>>cnt ;
00219                 //fprintf(stdout, "%lx %lx %lx\n", res, m, b) ;
00220                 
00221                 //res = res & bits & mask ;
00222                 
00223                 return res ;
00224             }
00225         
00226 
00227         inline uint64_t operator[](uint64_t index) const
00228         {
00229             ASSERT(index<length);
00230 
00231             uint64_t bitindex=alphabet->get_num_bits()*index;
00232             int32_t ws=8*sizeof(uint64_t);
00233             uint64_t i=bitindex/ws;
00234             int32_t j=bitindex % ws;
00235             int32_t missing=word_len-(ws-j);
00236 
00237             //SG_SPRINT("i=%lld j=%d ws=%d word_len=%d missing=%d left=%llx shift=%d\n", i, j, ws, word_len, missing, ( string[i] << j ) & mask, ws-word_len);
00238             uint64_t res= ((string[i] << j) & mask ) >> (ws-word_len);
00239 
00240             if (missing>0)
00241                 res|= ( string[i+1] >> (ws-missing) );
00242 
00243             //res = condense(res, 1<<31|1<<24|1<<10|1<<8|1<<4|1<<2|1<<1|1) ;
00244 
00245             return res;
00246         }
00247 
00248         inline uint64_t get_length() const { return length-word_len/alphabet->get_num_bits()+1; }
00249 
00251         inline virtual const char* get_name() const { return "BitString"; }
00252 
00253     private:
00255         CAlphabet* alphabet;
00257         uint64_t* string;
00259         uint64_t length;
00261         int32_t word_len;
00263         uint64_t mask;
00264 };
00265 #endif //__BITSTRING_H__

SHOGUN Machine Learning Toolbox - Documentation