00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011 #include <string.h>
00012 #include <math.h>
00013
00014 #include "features/Alphabet.h"
00015 #include "lib/io.h"
00016
00017
00018 #ifdef HAVE_BOOST_SERIALIZATION
00019 #include <boost/serialization/export.hpp>
00020 BOOST_CLASS_EXPORT(shogun::CAlphabet);
00021 #endif //HAVE_BOOST_SERIALIZATION
00022
00023
00024 using namespace shogun;
00025
00026
00027 const uint8_t CAlphabet::B_A=0;
00028 const uint8_t CAlphabet::B_C=1;
00029 const uint8_t CAlphabet::B_G=2;
00030 const uint8_t CAlphabet::B_T=3;
00031 const uint8_t CAlphabet::B_0=4;
00032 const uint8_t CAlphabet::MAPTABLE_UNDEF=0xff;
00033 const char* CAlphabet::alphabet_names[18]={
00034 "DNA","RAWDNA", "RNA", "PROTEIN", "BINARY", "ALPHANUM",
00035 "CUBE", "RAW", "IUPAC_NUCLEIC_ACID", "IUPAC_AMINO_ACID",
00036 "NONE", "DIGIT", "DIGIT2", "RAWDIGIT", "RAWDIGIT2", "UNKNOWN",
00037 "SNP", "RAWSNP"};
00038
00039
00040
00041
00042
00043
00044
00045
00046
00047 CAlphabet::CAlphabet(char* al, int32_t len)
00048 : CSGObject()
00049 {
00050 EAlphabet alpha=NONE;
00051
00052 if (len>=(int32_t) strlen("DNA") && !strncmp(al, "DNA", strlen("DNA")))
00053 alpha = DNA;
00054 else if (len>=(int32_t) strlen("RAWDNA") && !strncmp(al, "RAWDNA", strlen("RAWDNA")))
00055 alpha = RAWDNA;
00056 else if (len>=(int32_t) strlen("RNA") && !strncmp(al, "RNA", strlen("RNA")))
00057 alpha = RNA;
00058 else if (len>=(int32_t) strlen("PROTEIN") && !strncmp(al, "PROTEIN", strlen("PROTEIN")))
00059 alpha = PROTEIN;
00060 else if (len>=(int32_t) strlen("BINARY") && !strncmp(al, "BINARY", strlen("IBINARY")))
00061 alpha = BINARY;
00062 else if (len>=(int32_t) strlen("ALPHANUM") && !strncmp(al, "ALPHANUM", strlen("ALPHANUM")))
00063 alpha = ALPHANUM;
00064 else if (len>=(int32_t) strlen("CUBE") && !strncmp(al, "CUBE", strlen("CUBE")))
00065 alpha = CUBE;
00066 else if (len>=(int32_t) strlen("DIGIT2") && !strncmp(al, "DIGIT2", strlen("DIGIT2")))
00067 alpha = DIGIT2;
00068 else if (len>=(int32_t) strlen("DIGIT") && !strncmp(al, "DIGIT", strlen("DIGIT")))
00069 alpha = DIGIT;
00070 else if (len>=(int32_t) strlen("RAWDIGIT2") && !strncmp(al, "RAWDIGIT2", strlen("RAWDIGIT2")))
00071 alpha = RAWDIGIT2;
00072 else if (len>=(int32_t) strlen("RAWDIGIT") && !strncmp(al, "RAWDIGIT", strlen("RAWDIGIT")))
00073 alpha = RAWDIGIT;
00074 else if (len>=(int32_t) strlen("SNP") && !strncmp(al, "SNP", strlen("SNP")))
00075 alpha = SNP;
00076 else if (len>=(int32_t) strlen("RAWSNP") && !strncmp(al, "RAWSNP", strlen("RAWSNP")))
00077 alpha = RAWSNP;
00078 else if ((len>=(int32_t) strlen("BYTE") && !strncmp(al, "BYTE", strlen("BYTE"))) ||
00079 (len>=(int32_t) strlen("RAW") && !strncmp(al, "RAW", strlen("RAW"))))
00080 alpha = RAWBYTE;
00081 else if (len>=(int32_t) strlen("IUPAC_NUCLEIC_ACID") && !strncmp(al, "IUPAC_NUCLEIC_ACID", strlen("IUPAC_NUCLEIC_ACID")))
00082 alpha = IUPAC_NUCLEIC_ACID;
00083 else if (len>=(int32_t) strlen("IUPAC_AMINO_ACID") && !strncmp(al, "IUPAC_AMINO_ACID", strlen("IUPAC_AMINO_ACID")))
00084 alpha = IUPAC_AMINO_ACID;
00085 else {
00086 SG_ERROR( "unknown alphabet %s\n", al);
00087 }
00088
00089 set_alphabet(alpha);
00090 }
00091
00092 CAlphabet::CAlphabet(EAlphabet alpha)
00093 : CSGObject()
00094 {
00095 set_alphabet(alpha);
00096 }
00097
00098 CAlphabet::CAlphabet(CAlphabet* a)
00099 : CSGObject()
00100 {
00101 ASSERT(a);
00102 set_alphabet(a->get_alphabet());
00103 copy_histogram(a);
00104 }
00105
00106 CAlphabet::~CAlphabet()
00107 {
00108 }
00109
00110 bool CAlphabet::set_alphabet(EAlphabet alpha)
00111 {
00112 bool result=true;
00113 alphabet=alpha;
00114
00115 switch (alphabet)
00116 {
00117 case DNA:
00118 case RAWDNA:
00119 num_symbols = 4;
00120 break;
00121 case RNA:
00122 num_symbols = 4;
00123 break;
00124 case PROTEIN:
00125 num_symbols = 26;
00126 break;
00127 case BINARY:
00128 num_symbols = 2;
00129 break;
00130 case ALPHANUM:
00131 num_symbols = 36;
00132 break;
00133 case CUBE:
00134 num_symbols = 6;
00135 break;
00136 case RAWBYTE:
00137 num_symbols = 256;
00138 break;
00139 case IUPAC_NUCLEIC_ACID:
00140 num_symbols = 16;
00141 break;
00142 case IUPAC_AMINO_ACID:
00143 num_symbols = 23;
00144 break;
00145 case NONE:
00146 num_symbols = 0;
00147 break;
00148 case DIGIT2:
00149 num_symbols = 3;
00150 break;
00151 case DIGIT:
00152 num_symbols = 10;
00153 break;
00154 case RAWDIGIT2:
00155 num_symbols = 3;
00156 break;
00157 case RAWDIGIT:
00158 num_symbols = 10;
00159 break;
00160 case SNP:
00161 num_symbols = 5;
00162 break;
00163 case RAWSNP:
00164 num_symbols = 5;
00165 break;
00166 default:
00167 num_symbols = 0;
00168 result=false;
00169 break;
00170 }
00171
00172 num_bits=(int32_t) ceil(log((float64_t) num_symbols)/log((float64_t) 2));
00173 init_map_table();
00174 clear_histogram();
00175
00176 SG_DEBUG( "initialised alphabet %s\n", get_alphabet_name(alphabet));
00177
00178 return result;
00179 }
00180
00181 void CAlphabet::init_map_table()
00182 {
00183 for (int32_t i=0; i<(1<<(8*sizeof(uint8_t))); i++)
00184 {
00185 maptable_to_bin[i] = MAPTABLE_UNDEF;
00186 maptable_to_char[i] = MAPTABLE_UNDEF;
00187 valid_chars[i] = false;
00188 }
00189
00190 switch (alphabet)
00191 {
00192 case RAWDIGIT:
00193 for (uint8_t i=0; i<=9; i++)
00194 {
00195 valid_chars[i]=true;
00196 maptable_to_bin[i]=i;
00197 maptable_to_char[i]=i;
00198 }
00199 break;
00200
00201 case RAWDIGIT2:
00202 for (uint8_t i=0; i<=2; i++)
00203 {
00204 valid_chars[i]=true;
00205 maptable_to_bin[i]=i;
00206 maptable_to_char[i]=i;
00207 }
00208 break;
00209
00210 case DIGIT:
00211 valid_chars[(uint8_t) '0']=true;
00212 valid_chars[(uint8_t) '1']=true;
00213 valid_chars[(uint8_t) '2']=true;
00214 valid_chars[(uint8_t) '3']=true;
00215 valid_chars[(uint8_t) '4']=true;
00216 valid_chars[(uint8_t) '5']=true;
00217 valid_chars[(uint8_t) '6']=true;
00218 valid_chars[(uint8_t) '7']=true;
00219 valid_chars[(uint8_t) '8']=true;
00220 valid_chars[(uint8_t) '9']=true;
00221
00222 maptable_to_bin[(uint8_t) '0']=0;
00223 maptable_to_bin[(uint8_t) '1']=1;
00224 maptable_to_bin[(uint8_t) '2']=2;
00225 maptable_to_bin[(uint8_t) '3']=3;
00226 maptable_to_bin[(uint8_t) '4']=4;
00227 maptable_to_bin[(uint8_t) '5']=5;
00228 maptable_to_bin[(uint8_t) '6']=6;
00229 maptable_to_bin[(uint8_t) '7']=7;
00230 maptable_to_bin[(uint8_t) '8']=8;
00231 maptable_to_bin[(uint8_t) '9']=9;
00232
00233 maptable_to_char[(uint8_t) 0]='0';
00234 maptable_to_char[(uint8_t) 1]='1';
00235 maptable_to_char[(uint8_t) 2]='2';
00236 maptable_to_char[(uint8_t) 3]='3';
00237 maptable_to_char[(uint8_t) 4]='4';
00238 maptable_to_char[(uint8_t) 5]='5';
00239 maptable_to_char[(uint8_t) 6]='6';
00240 maptable_to_char[(uint8_t) 7]='7';
00241 maptable_to_char[(uint8_t) 8]='8';
00242 maptable_to_char[(uint8_t) 9]='9';
00243 break;
00244
00245 case DIGIT2:
00246 valid_chars[(uint8_t) '0']=true;
00247 valid_chars[(uint8_t) '1']=true;
00248 valid_chars[(uint8_t) '2']=true;
00249
00250 maptable_to_bin[(uint8_t) '0']=0;
00251 maptable_to_bin[(uint8_t) '1']=1;
00252 maptable_to_bin[(uint8_t) '2']=2;
00253
00254 maptable_to_char[(uint8_t) 0]='0';
00255 maptable_to_char[(uint8_t) 1]='1';
00256 maptable_to_char[(uint8_t) 2]='2';
00257 break;
00258
00259 case CUBE:
00260 valid_chars[(uint8_t) '1']=true;
00261 valid_chars[(uint8_t) '2']=true;
00262 valid_chars[(uint8_t) '3']=true;
00263 valid_chars[(uint8_t) '4']=true;
00264 valid_chars[(uint8_t) '5']=true;
00265 valid_chars[(uint8_t) '6']=true;
00266
00267 maptable_to_bin[(uint8_t) '1']=0;
00268 maptable_to_bin[(uint8_t) '2']=1;
00269 maptable_to_bin[(uint8_t) '3']=2;
00270 maptable_to_bin[(uint8_t) '4']=3;
00271 maptable_to_bin[(uint8_t) '5']=4;
00272 maptable_to_bin[(uint8_t) '6']=5;
00273
00274 maptable_to_char[(uint8_t) 0]='1';
00275 maptable_to_char[(uint8_t) 1]='2';
00276 maptable_to_char[(uint8_t) 2]='3';
00277 maptable_to_char[(uint8_t) 3]='4';
00278 maptable_to_char[(uint8_t) 4]='5';
00279 maptable_to_char[(uint8_t) 5]='6';
00280 break;
00281
00282 case PROTEIN:
00283 {
00284 int32_t skip=0 ;
00285 for (int32_t i=0; i<21; i++)
00286 {
00287 if (i==1) skip++ ;
00288 if (i==8) skip++ ;
00289 if (i==12) skip++ ;
00290 if (i==17) skip++ ;
00291 valid_chars['A'+i+skip]=true;
00292 maptable_to_bin['A'+i+skip]=i ;
00293 maptable_to_char[i]='A'+i+skip ;
00294 } ;
00295 } ;
00296 break;
00297
00298 case BINARY:
00299 valid_chars[(uint8_t) '0']=true;
00300 valid_chars[(uint8_t) '1']=true;
00301
00302 maptable_to_bin[(uint8_t) '0']=0;
00303 maptable_to_bin[(uint8_t) '1']=1;
00304
00305 maptable_to_char[0]=(uint8_t) '0';
00306 maptable_to_char[1]=(uint8_t) '1';
00307 break;
00308
00309 case ALPHANUM:
00310 {
00311 for (int32_t i=0; i<26; i++)
00312 {
00313 valid_chars['A'+i]=true;
00314 maptable_to_bin['A'+i]=i ;
00315 maptable_to_char[i]='A'+i ;
00316 } ;
00317 for (int32_t i=0; i<10; i++)
00318 {
00319 valid_chars['0'+i]=true;
00320 maptable_to_bin['0'+i]=26+i ;
00321 maptable_to_char[26+i]='0'+i ;
00322 } ;
00323 } ;
00324 break;
00325
00326 case RAWBYTE:
00327 {
00328
00329 for (int32_t i=0; i<256; i++)
00330 {
00331 valid_chars[i]=true;
00332 maptable_to_bin[i]=i;
00333 maptable_to_char[i]=i;
00334 }
00335 }
00336 break;
00337
00338 case DNA:
00339 valid_chars[(uint8_t) 'A']=true;
00340 valid_chars[(uint8_t) 'C']=true;
00341 valid_chars[(uint8_t) 'G']=true;
00342 valid_chars[(uint8_t) 'T']=true;
00343
00344 maptable_to_bin[(uint8_t) 'A']=B_A;
00345 maptable_to_bin[(uint8_t) 'C']=B_C;
00346 maptable_to_bin[(uint8_t) 'G']=B_G;
00347 maptable_to_bin[(uint8_t) 'T']=B_T;
00348
00349 maptable_to_char[B_A]='A';
00350 maptable_to_char[B_C]='C';
00351 maptable_to_char[B_G]='G';
00352 maptable_to_char[B_T]='T';
00353 break;
00354 case RAWDNA:
00355 {
00356
00357 for (int32_t i=0; i<4; i++)
00358 {
00359 valid_chars[i]=true;
00360 maptable_to_bin[i]=i;
00361 maptable_to_char[i]=i;
00362 }
00363 }
00364 break;
00365
00366 case SNP:
00367 valid_chars[(uint8_t) 'A']=true;
00368 valid_chars[(uint8_t) 'C']=true;
00369 valid_chars[(uint8_t) 'G']=true;
00370 valid_chars[(uint8_t) 'T']=true;
00371 valid_chars[(uint8_t) '0']=true;
00372
00373 maptable_to_bin[(uint8_t) 'A']=B_A;
00374 maptable_to_bin[(uint8_t) 'C']=B_C;
00375 maptable_to_bin[(uint8_t) 'G']=B_G;
00376 maptable_to_bin[(uint8_t) 'T']=B_T;
00377 maptable_to_bin[(uint8_t) '0']=B_0;
00378
00379 maptable_to_char[B_A]='A';
00380 maptable_to_char[B_C]='C';
00381 maptable_to_char[B_G]='G';
00382 maptable_to_char[B_T]='T';
00383 maptable_to_char[B_0]='0';
00384 break;
00385 case RAWSNP:
00386 {
00387
00388 for (int32_t i=0; i<5; i++)
00389 {
00390 valid_chars[i]=true;
00391 maptable_to_bin[i]=i;
00392 maptable_to_char[i]=i;
00393 }
00394 }
00395 break;
00396
00397 case RNA:
00398 valid_chars[(uint8_t) 'A']=true;
00399 valid_chars[(uint8_t) 'C']=true;
00400 valid_chars[(uint8_t) 'G']=true;
00401 valid_chars[(uint8_t) 'U']=true;
00402
00403 maptable_to_bin[(uint8_t) 'A']=B_A;
00404 maptable_to_bin[(uint8_t) 'C']=B_C;
00405 maptable_to_bin[(uint8_t) 'G']=B_G;
00406 maptable_to_bin[(uint8_t) 'U']=B_T;
00407
00408 maptable_to_char[B_A]='A';
00409 maptable_to_char[B_C]='C';
00410 maptable_to_char[B_G]='G';
00411 maptable_to_char[B_T]='U';
00412 break;
00413
00414 case IUPAC_NUCLEIC_ACID:
00415 valid_chars[(uint8_t) 'A']=true;
00416 valid_chars[(uint8_t) 'C']=true;
00417 valid_chars[(uint8_t) 'G']=true;
00418 valid_chars[(uint8_t) 'T']=true;
00419 valid_chars[(uint8_t) 'U']=true;
00420 valid_chars[(uint8_t) 'R']=true;
00421 valid_chars[(uint8_t) 'Y']=true;
00422 valid_chars[(uint8_t) 'M']=true;
00423 valid_chars[(uint8_t) 'K']=true;
00424 valid_chars[(uint8_t) 'W']=true;
00425 valid_chars[(uint8_t) 'S']=true;
00426 valid_chars[(uint8_t) 'B']=true;
00427 valid_chars[(uint8_t) 'D']=true;
00428 valid_chars[(uint8_t) 'H']=true;
00429 valid_chars[(uint8_t) 'V']=true;
00430 valid_chars[(uint8_t) 'N']=true;
00431
00432 maptable_to_bin[(uint8_t) 'A']=0;
00433 maptable_to_bin[(uint8_t) 'C']=1;
00434 maptable_to_bin[(uint8_t) 'G']=2;
00435 maptable_to_bin[(uint8_t) 'T']=3;
00436 maptable_to_bin[(uint8_t) 'U']=4;
00437 maptable_to_bin[(uint8_t) 'R']=5;
00438 maptable_to_bin[(uint8_t) 'Y']=6;
00439 maptable_to_bin[(uint8_t) 'M']=7;
00440 maptable_to_bin[(uint8_t) 'K']=8;
00441 maptable_to_bin[(uint8_t) 'W']=9;
00442 maptable_to_bin[(uint8_t) 'S']=10;
00443 maptable_to_bin[(uint8_t) 'B']=11;
00444 maptable_to_bin[(uint8_t) 'D']=12;
00445 maptable_to_bin[(uint8_t) 'H']=13;
00446 maptable_to_bin[(uint8_t) 'V']=14;
00447 maptable_to_bin[(uint8_t) 'N']=15;
00448
00449 maptable_to_char[0]=(uint8_t) 'A';
00450 maptable_to_char[1]=(uint8_t) 'C';
00451 maptable_to_char[2]=(uint8_t) 'G';
00452 maptable_to_char[3]=(uint8_t) 'T';
00453 maptable_to_char[4]=(uint8_t) 'U';
00454 maptable_to_char[5]=(uint8_t) 'R';
00455 maptable_to_char[6]=(uint8_t) 'Y';
00456 maptable_to_char[7]=(uint8_t) 'M';
00457 maptable_to_char[8]=(uint8_t) 'K';
00458 maptable_to_char[9]=(uint8_t) 'W';
00459 maptable_to_char[10]=(uint8_t) 'S';
00460 maptable_to_char[11]=(uint8_t) 'B';
00461 maptable_to_char[12]=(uint8_t) 'D';
00462 maptable_to_char[13]=(uint8_t) 'H';
00463 maptable_to_char[14]=(uint8_t) 'V';
00464 maptable_to_char[15]=(uint8_t) 'N';
00465 break;
00466
00467 case IUPAC_AMINO_ACID:
00468 valid_chars[(uint8_t) 'A']=true;
00469 valid_chars[(uint8_t) 'R']=true;
00470 valid_chars[(uint8_t) 'N']=true;
00471 valid_chars[(uint8_t) 'D']=true;
00472 valid_chars[(uint8_t) 'C']=true;
00473 valid_chars[(uint8_t) 'Q']=true;
00474 valid_chars[(uint8_t) 'E']=true;
00475 valid_chars[(uint8_t) 'G']=true;
00476 valid_chars[(uint8_t) 'H']=true;
00477 valid_chars[(uint8_t) 'I']=true;
00478 valid_chars[(uint8_t) 'L']=true;
00479 valid_chars[(uint8_t) 'K']=true;
00480 valid_chars[(uint8_t) 'M']=true;
00481 valid_chars[(uint8_t) 'F']=true;
00482 valid_chars[(uint8_t) 'P']=true;
00483 valid_chars[(uint8_t) 'S']=true;
00484 valid_chars[(uint8_t) 'T']=true;
00485 valid_chars[(uint8_t) 'W']=true;
00486 valid_chars[(uint8_t) 'Y']=true;
00487 valid_chars[(uint8_t) 'V']=true;
00488 valid_chars[(uint8_t) 'B']=true;
00489 valid_chars[(uint8_t) 'Z']=true;
00490 valid_chars[(uint8_t) 'X']=true;
00491
00492 maptable_to_bin[(uint8_t) 'A']=0;
00493 maptable_to_bin[(uint8_t) 'R']=1;
00494 maptable_to_bin[(uint8_t) 'N']=2;
00495 maptable_to_bin[(uint8_t) 'D']=3;
00496 maptable_to_bin[(uint8_t) 'C']=4;
00497 maptable_to_bin[(uint8_t) 'Q']=5;
00498 maptable_to_bin[(uint8_t) 'E']=6;
00499 maptable_to_bin[(uint8_t) 'G']=7;
00500 maptable_to_bin[(uint8_t) 'H']=8;
00501 maptable_to_bin[(uint8_t) 'I']=9;
00502 maptable_to_bin[(uint8_t) 'L']=10;
00503 maptable_to_bin[(uint8_t) 'K']=11;
00504 maptable_to_bin[(uint8_t) 'M']=12;
00505 maptable_to_bin[(uint8_t) 'F']=13;
00506 maptable_to_bin[(uint8_t) 'P']=14;
00507 maptable_to_bin[(uint8_t) 'S']=15;
00508 maptable_to_bin[(uint8_t) 'T']=16;
00509 maptable_to_bin[(uint8_t) 'W']=17;
00510 maptable_to_bin[(uint8_t) 'Y']=18;
00511 maptable_to_bin[(uint8_t) 'V']=19;
00512 maptable_to_bin[(uint8_t) 'B']=20;
00513 maptable_to_bin[(uint8_t) 'Z']=21;
00514 maptable_to_bin[(uint8_t) 'X']=22;
00515
00516 maptable_to_char[0]=(uint8_t) 'A';
00517 maptable_to_char[1]=(uint8_t) 'R';
00518 maptable_to_char[2]=(uint8_t) 'N';
00519 maptable_to_char[3]=(uint8_t) 'D';
00520 maptable_to_char[4]=(uint8_t) 'C';
00521 maptable_to_char[5]=(uint8_t) 'Q';
00522 maptable_to_char[6]=(uint8_t) 'E';
00523 maptable_to_char[7]=(uint8_t) 'G';
00524 maptable_to_char[8]=(uint8_t) 'H';
00525 maptable_to_char[9]=(uint8_t) 'I';
00526 maptable_to_char[10]=(uint8_t) 'L';
00527 maptable_to_char[11]=(uint8_t) 'K';
00528 maptable_to_char[12]=(uint8_t) 'M';
00529 maptable_to_char[13]=(uint8_t) 'F';
00530 maptable_to_char[14]=(uint8_t) 'P';
00531 maptable_to_char[15]=(uint8_t) 'S';
00532 maptable_to_char[16]=(uint8_t) 'T';
00533 maptable_to_char[17]=(uint8_t) 'W';
00534 maptable_to_char[18]=(uint8_t) 'Y';
00535 maptable_to_char[19]=(uint8_t) 'V';
00536 maptable_to_char[20]=(uint8_t) 'B';
00537 maptable_to_char[21]=(uint8_t) 'Z';
00538 maptable_to_char[22]=(uint8_t) 'X';
00539 break;
00540
00541 default:
00542 break;
00543 };
00544 }
00545
00546 void CAlphabet::clear_histogram()
00547 {
00548 memset(histogram, 0, sizeof(histogram));
00549 print_histogram();
00550 }
00551
00552 int32_t CAlphabet::get_max_value_in_histogram()
00553 {
00554 int32_t max_sym=-1;
00555 for (int32_t i=(int32_t) (1 <<(sizeof(uint8_t)*8))-1;i>=0; i--)
00556 {
00557 if (histogram[i])
00558 {
00559 max_sym=i;
00560 break;
00561 }
00562 }
00563
00564 return max_sym;
00565 }
00566
00567 int32_t CAlphabet::get_num_symbols_in_histogram()
00568 {
00569 int32_t num_sym=0;
00570 for (int32_t i=0; i<(int32_t) (1 <<(sizeof(uint8_t)*8)); i++)
00571 {
00572 if (histogram[i])
00573 num_sym++;
00574 }
00575
00576 return num_sym;
00577 }
00578
00579 int32_t CAlphabet::get_num_bits_in_histogram()
00580 {
00581 int32_t num_sym=get_num_symbols_in_histogram();
00582 if (num_sym>0)
00583 return (int32_t) ceil(log((float64_t) num_sym)/log((float64_t) 2));
00584 else
00585 return 0;
00586 }
00587
00588 void CAlphabet::print_histogram()
00589 {
00590 for (int32_t i=0; i<(int32_t) (1 <<(sizeof(uint8_t)*8)); i++)
00591 {
00592 if (histogram[i])
00593 SG_PRINT( "hist[%d]=%lld\n", i, histogram[i]);
00594 }
00595 }
00596
00597 bool CAlphabet::check_alphabet(bool print_error)
00598 {
00599 bool result = true;
00600
00601 for (int32_t i=0; i<(int32_t) (1 <<(sizeof(uint8_t)*8)); i++)
00602 {
00603 if (histogram[i]>0 && valid_chars[i]==0)
00604 {
00605 result=false;
00606 break;
00607 }
00608 }
00609
00610 if (!result && print_error)
00611 {
00612 print_histogram();
00613 SG_ERROR( "ALPHABET does not contain all symbols in histogram\n");
00614 }
00615
00616 return result;
00617 }
00618
00619 bool CAlphabet::check_alphabet_size(bool print_error)
00620 {
00621 if (get_num_bits_in_histogram() > get_num_bits())
00622 {
00623 if (print_error)
00624 {
00625 print_histogram();
00626 fprintf(stderr, "get_num_bits_in_histogram()=%i > get_num_bits()=%i\n", get_num_bits_in_histogram(), get_num_bits()) ;
00627 SG_ERROR( "ALPHABET too small to contain all symbols in histogram\n");
00628 }
00629 return false;
00630 }
00631 else
00632 return true;
00633
00634 }
00635
00636 void CAlphabet::copy_histogram(CAlphabet* a)
00637 {
00638 memcpy(histogram, a->get_histogram(), sizeof(histogram));
00639 }
00640
00641 const char* CAlphabet::get_alphabet_name(EAlphabet alphabet)
00642 {
00643
00644 int32_t idx;
00645 switch (alphabet)
00646 {
00647 case DNA:
00648 idx=0;
00649 break;
00650 case RAWDNA:
00651 idx=1;
00652 break;
00653 case RNA:
00654 idx=2;
00655 break;
00656 case PROTEIN:
00657 idx=3;
00658 break;
00659 case BINARY:
00660 idx=4;
00661 break;
00662 case ALPHANUM:
00663 idx=5;
00664 break;
00665 case CUBE:
00666 idx=6;
00667 break;
00668 case RAWBYTE:
00669 idx=7;
00670 break;
00671 case IUPAC_NUCLEIC_ACID:
00672 idx=8;
00673 break;
00674 case IUPAC_AMINO_ACID:
00675 idx=9;
00676 break;
00677 case NONE:
00678 idx=10;
00679 break;
00680 case DIGIT:
00681 idx=11;
00682 break;
00683 case DIGIT2:
00684 idx=12;
00685 break;
00686 default:
00687 idx=13;
00688 break;
00689 }
00690 return alphabet_names[idx];
00691 }
00692