BitString.h
Go to the documentation of this file.00001
00002
00003
00004
00005
00006
00007
00008
00009
00010 #ifndef __BITSTRING_H__
00011 #define __BITSTRING_H__
00012
00013 #include <shogun/features/Alphabet.h>
00014 #include <shogun/lib/common.h>
00015 #include <shogun/lib/io.h>
00016 #include <shogun/lib/MemoryMappedFile.h>
00017 #include <shogun/lib/Mathematics.h>
00018
00026 class CBitString : public CSGObject
00027 {
00028 public:
00036 CBitString(EAlphabet alpha, int32_t width=1) : CSGObject(), string(NULL), length(0)
00037 {
00038 alphabet=new CAlphabet(alpha);
00039 int32_t nbits=alphabet->get_num_bits();
00040 word_len=width*nbits;
00041 mask=0;
00042
00043 for (int32_t j=0; j<word_len; j++)
00044 mask=(mask<<1) | (uint64_t) 1;
00045 mask<<=sizeof(uint64_t)*8-word_len;
00046 }
00047
00049 ~CBitString()
00050 {
00051 cleanup();
00052 SG_UNREF(alphabet);
00053 }
00054
00056 void cleanup()
00057 {
00058 delete[] string;
00059 string=NULL;
00060 length=0;
00061 }
00062
00068 void obtain_from_char(char* str, uint64_t len)
00069 {
00070 cleanup();
00071 uint64_t stream_len=len/sizeof(uint64_t)+1;
00072 string=new uint64_t[stream_len];
00073 length=len;
00074
00075 uint64_t w=0;
00076 int32_t nbits=alphabet->get_num_bits();
00077 uint64_t j=0;
00078 int32_t nfit=8*sizeof(w)/nbits;
00079 for (uint64_t i=0; i<len; i++)
00080 {
00081 w= (w << nbits) | alphabet->remap_to_bin((uint8_t) str[j]);
00082
00083 if (i % nfit == nfit-1)
00084 {
00085 string[j]=w;
00086 j++;
00087 }
00088 }
00089
00090 if (j<stream_len)
00091 {
00092 string[j]=w;
00093 j++;
00094 }
00095
00096 ASSERT(j==stream_len);
00097 }
00098
00105 void load_fasta_file(const char* fname, bool ignore_invalid=false)
00106 {
00107 cleanup();
00108
00109 uint64_t len=0;
00110 uint64_t offs=0;
00111
00112 CMemoryMappedFile<char> f(fname);
00113
00114 uint64_t id_len=0;
00115 char* id=f.get_line(id_len, offs);
00116
00117 if (!id_len || id[0]!='>')
00118 SG_SERROR("No fasta hunks (lines starting with '>') found\n");
00119
00120 if (offs==f.get_size())
00121 SG_SERROR("Empty file?\n");
00122
00123 char* fasta=NULL;
00124 char* s=NULL;
00125 int32_t spanned_lines=0;
00126 int32_t fasta_len=0;
00127
00128 while (true)
00129 {
00130 s=f.get_line(len, offs);
00131
00132 if (!fasta)
00133 fasta=s;
00134
00135 if (!s || len==0)
00136 SG_SERROR("Error reading fasta entry in line %d len=%ld", spanned_lines+1, len);
00137
00138 if (s[0]=='>')
00139 SG_SERROR("Multiple fasta hunks (lines starting with '>') are not supported!\n");
00140
00141 if (offs==f.get_size())
00142 {
00143 offs-=len+1;
00144 fasta_len+=len;
00145
00146 uint64_t w=0;
00147 int32_t nbits=alphabet->get_num_bits();
00148 uint64_t nfit=8*sizeof(w)/nbits;
00149
00150 len = fasta_len-spanned_lines;
00151 uint64_t stream_len=len/(nfit)+1;
00152 string=new uint64_t[stream_len];
00153 length=len;
00154
00155 int32_t idx=0;
00156 int32_t k=0;
00157
00158 for (int32_t j=0; j<fasta_len; j++, k++)
00159 {
00160 if (fasta[j]=='\n')
00161 {
00162 k--;
00163 continue;
00164 }
00165
00166 w= (w << nbits) | alphabet->remap_to_bin((uint8_t) fasta[j]);
00167
00168 if (k % nfit == nfit-1)
00169 {
00170 string[idx]=w;
00171 w=0;
00172 idx++;
00173 }
00174 }
00175
00176 if (idx<stream_len)
00177 string[idx]=w<<(nbits*(nfit - k%nfit));
00178
00179 break;
00180 }
00181
00182 spanned_lines++;
00183 fasta_len+=len+1;
00184 }
00185 }
00186
00192 void set_string(uint64_t* str, uint64_t len)
00193 {
00194 cleanup();
00195 string=str;
00196 length=len;
00197 }
00198
00199 inline uint64_t condense(uint64_t bits, uint64_t mask) const
00200 {
00201 uint64_t res = 0 ;
00202 uint64_t m=mask ;
00203 uint64_t b=bits ;
00204 char cnt=0 ;
00205
00206 char ar[256][256] ;
00207
00208 for (int i=0; i<8; i++)
00209 {
00210
00211 if (m&1)
00212 res = res>>8 | ar[b&255][m&255] ;
00213
00214
00215 m=m>>8 ;
00216 b=b>>8 ;
00217 }
00218 res=res>>cnt ;
00219
00220
00221
00222
00223 return res ;
00224 }
00225
00226
00227 inline uint64_t operator[](uint64_t index) const
00228 {
00229 ASSERT(index<length);
00230
00231 uint64_t bitindex=alphabet->get_num_bits()*index;
00232 int32_t ws=8*sizeof(uint64_t);
00233 uint64_t i=bitindex/ws;
00234 int32_t j=bitindex % ws;
00235 int32_t missing=word_len-(ws-j);
00236
00237
00238 uint64_t res= ((string[i] << j) & mask ) >> (ws-word_len);
00239
00240 if (missing>0)
00241 res|= ( string[i+1] >> (ws-missing) );
00242
00243
00244
00245 return res;
00246 }
00247
00248 inline uint64_t get_length() const { return length-word_len/alphabet->get_num_bits()+1; }
00249
00251 inline virtual const char* get_name() const { return "BitString"; }
00252
00253 private:
00255 CAlphabet* alphabet;
00257 uint64_t* string;
00259 uint64_t length;
00261 int32_t word_len;
00263 uint64_t mask;
00264 };
00265 #endif //__BITSTRING_H__