Alphabet.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2006-2009 Soeren Sonnenburg
00008  * Copyright (C) 2006-2009 Fraunhofer Institute FIRST and Max-Planck-Society
00009  */
00010 
00011 #include <string.h>
00012 #include <math.h>
00013 
00014 #include "features/Alphabet.h"
00015 #include "lib/io.h"
00016 
00017 
00018 #ifdef HAVE_BOOST_SERIALIZATION
00019 #include <boost/serialization/export.hpp>
00020 BOOST_CLASS_EXPORT(shogun::CAlphabet);
00021 #endif //HAVE_BOOST_SERIALIZATION
00022 
00023 
00024 using namespace shogun;
00025 
00026 //define numbers for the bases
00027 const uint8_t CAlphabet::B_A=0;
00028 const uint8_t CAlphabet::B_C=1;
00029 const uint8_t CAlphabet::B_G=2;
00030 const uint8_t CAlphabet::B_T=3;
00031 const uint8_t CAlphabet::B_0=4;
00032 const uint8_t CAlphabet::MAPTABLE_UNDEF=0xff;
00033 const char* CAlphabet::alphabet_names[18]={
00034     "DNA","RAWDNA", "RNA", "PROTEIN", "BINARY", "ALPHANUM",
00035     "CUBE", "RAW", "IUPAC_NUCLEIC_ACID", "IUPAC_AMINO_ACID",
00036     "NONE", "DIGIT", "DIGIT2", "RAWDIGIT", "RAWDIGIT2", "UNKNOWN",
00037     "SNP", "RAWSNP"};
00038 
00039 /*
00040 CAlphabet::CAlphabet()
00041 : CSGObject()
00042 {
00043 
00044 }
00045 */
00046 
00047 CAlphabet::CAlphabet(char* al, int32_t len)
00048 : CSGObject()
00049 {
00050     EAlphabet alpha=NONE;
00051 
00052     if (len>=(int32_t) strlen("DNA") && !strncmp(al, "DNA", strlen("DNA")))
00053         alpha = DNA;
00054     else if (len>=(int32_t) strlen("RAWDNA") && !strncmp(al, "RAWDNA", strlen("RAWDNA")))
00055         alpha = RAWDNA;
00056     else if (len>=(int32_t) strlen("RNA") && !strncmp(al, "RNA", strlen("RNA")))
00057         alpha = RNA;
00058     else if (len>=(int32_t) strlen("PROTEIN") && !strncmp(al, "PROTEIN", strlen("PROTEIN")))
00059         alpha = PROTEIN;
00060     else if (len>=(int32_t) strlen("BINARY") && !strncmp(al, "BINARY", strlen("IBINARY")))
00061         alpha = BINARY;
00062     else if (len>=(int32_t) strlen("ALPHANUM") && !strncmp(al, "ALPHANUM", strlen("ALPHANUM")))
00063         alpha = ALPHANUM;
00064     else if (len>=(int32_t) strlen("CUBE") && !strncmp(al, "CUBE", strlen("CUBE")))
00065         alpha = CUBE;
00066     else if (len>=(int32_t) strlen("DIGIT2") && !strncmp(al, "DIGIT2", strlen("DIGIT2")))
00067         alpha = DIGIT2;
00068     else if (len>=(int32_t) strlen("DIGIT") && !strncmp(al, "DIGIT", strlen("DIGIT")))
00069         alpha = DIGIT;
00070     else if (len>=(int32_t) strlen("RAWDIGIT2") && !strncmp(al, "RAWDIGIT2", strlen("RAWDIGIT2")))
00071         alpha = RAWDIGIT2;
00072     else if (len>=(int32_t) strlen("RAWDIGIT") && !strncmp(al, "RAWDIGIT", strlen("RAWDIGIT")))
00073         alpha = RAWDIGIT;
00074     else if (len>=(int32_t) strlen("SNP") && !strncmp(al, "SNP", strlen("SNP")))
00075         alpha = SNP;
00076     else if (len>=(int32_t) strlen("RAWSNP") && !strncmp(al, "RAWSNP", strlen("RAWSNP")))
00077         alpha = RAWSNP;
00078     else if ((len>=(int32_t) strlen("BYTE") && !strncmp(al, "BYTE", strlen("BYTE"))) ||
00079             (len>=(int32_t) strlen("RAW") && !strncmp(al, "RAW", strlen("RAW"))))
00080         alpha = RAWBYTE;
00081     else if (len>=(int32_t) strlen("IUPAC_NUCLEIC_ACID") && !strncmp(al, "IUPAC_NUCLEIC_ACID", strlen("IUPAC_NUCLEIC_ACID")))
00082         alpha = IUPAC_NUCLEIC_ACID;
00083     else if (len>=(int32_t) strlen("IUPAC_AMINO_ACID") && !strncmp(al, "IUPAC_AMINO_ACID", strlen("IUPAC_AMINO_ACID")))
00084         alpha = IUPAC_AMINO_ACID;
00085     else {
00086       SG_ERROR( "unknown alphabet %s\n", al);
00087    }
00088 
00089     set_alphabet(alpha);
00090 }
00091 
00092 CAlphabet::CAlphabet(EAlphabet alpha)
00093 : CSGObject()
00094 {
00095     set_alphabet(alpha);
00096 }
00097 
00098 CAlphabet::CAlphabet(CAlphabet* a)
00099 : CSGObject()
00100 {
00101     ASSERT(a);
00102     set_alphabet(a->get_alphabet());
00103     copy_histogram(a);
00104 }
00105 
00106 CAlphabet::~CAlphabet()
00107 {
00108 }
00109 
00110 bool CAlphabet::set_alphabet(EAlphabet alpha)
00111 {
00112     bool result=true;
00113     alphabet=alpha;
00114 
00115     switch (alphabet)
00116     {
00117         case DNA:
00118         case RAWDNA:
00119             num_symbols = 4;
00120             break;
00121         case RNA:
00122             num_symbols = 4;
00123             break;
00124         case PROTEIN:
00125             num_symbols = 26;
00126             break;
00127         case BINARY:
00128             num_symbols = 2;
00129             break;
00130         case ALPHANUM:
00131             num_symbols = 36;
00132             break;
00133         case CUBE:
00134             num_symbols = 6;
00135             break;
00136         case RAWBYTE:
00137             num_symbols = 256;
00138             break;
00139         case IUPAC_NUCLEIC_ACID:
00140             num_symbols = 16;
00141             break;
00142         case IUPAC_AMINO_ACID:
00143             num_symbols = 23;
00144             break;
00145         case NONE:
00146             num_symbols = 0;
00147             break;
00148         case DIGIT2:
00149             num_symbols = 3;
00150             break;
00151         case DIGIT:
00152             num_symbols = 10;
00153             break;
00154         case RAWDIGIT2:
00155             num_symbols = 3;
00156             break;
00157         case RAWDIGIT:
00158             num_symbols = 10;
00159             break;
00160         case SNP:
00161             num_symbols = 5;
00162             break;
00163         case RAWSNP:
00164             num_symbols = 5;
00165             break;
00166         default:
00167             num_symbols = 0;
00168             result=false;
00169             break;
00170     }
00171 
00172     num_bits=(int32_t) ceil(log((float64_t) num_symbols)/log((float64_t) 2));
00173     init_map_table();
00174     clear_histogram();
00175 
00176     SG_DEBUG( "initialised alphabet %s\n", get_alphabet_name(alphabet));
00177 
00178     return result;
00179 }
00180 
00181 void CAlphabet::init_map_table()
00182 {
00183     for (int32_t i=0; i<(1<<(8*sizeof(uint8_t))); i++)
00184     {
00185         maptable_to_bin[i] = MAPTABLE_UNDEF;
00186         maptable_to_char[i] = MAPTABLE_UNDEF;
00187         valid_chars[i] = false;
00188     }
00189 
00190     switch (alphabet)
00191     {
00192         case RAWDIGIT:
00193             for (uint8_t i=0; i<=9; i++)
00194             {
00195                 valid_chars[i]=true;
00196                 maptable_to_bin[i]=i;
00197                 maptable_to_char[i]=i;
00198             }
00199             break;
00200 
00201         case RAWDIGIT2:
00202             for (uint8_t i=0; i<=2; i++)
00203             {
00204                 valid_chars[i]=true;
00205                 maptable_to_bin[i]=i;
00206                 maptable_to_char[i]=i;
00207             }
00208             break;
00209 
00210         case DIGIT:
00211             valid_chars[(uint8_t) '0']=true;
00212             valid_chars[(uint8_t) '1']=true;
00213             valid_chars[(uint8_t) '2']=true;
00214             valid_chars[(uint8_t) '3']=true;
00215             valid_chars[(uint8_t) '4']=true;
00216             valid_chars[(uint8_t) '5']=true;
00217             valid_chars[(uint8_t) '6']=true;
00218             valid_chars[(uint8_t) '7']=true;
00219             valid_chars[(uint8_t) '8']=true;
00220             valid_chars[(uint8_t) '9']=true;    //Translation '0-9' -> 0-9
00221 
00222             maptable_to_bin[(uint8_t) '0']=0;
00223             maptable_to_bin[(uint8_t) '1']=1;
00224             maptable_to_bin[(uint8_t) '2']=2;
00225             maptable_to_bin[(uint8_t) '3']=3;
00226             maptable_to_bin[(uint8_t) '4']=4;
00227             maptable_to_bin[(uint8_t) '5']=5;
00228             maptable_to_bin[(uint8_t) '6']=6;
00229             maptable_to_bin[(uint8_t) '7']=7;
00230             maptable_to_bin[(uint8_t) '8']=8;
00231             maptable_to_bin[(uint8_t) '9']=9;   //Translation '0-9' -> 0-9
00232 
00233             maptable_to_char[(uint8_t) 0]='0';
00234             maptable_to_char[(uint8_t) 1]='1';
00235             maptable_to_char[(uint8_t) 2]='2';
00236             maptable_to_char[(uint8_t) 3]='3';
00237             maptable_to_char[(uint8_t) 4]='4';
00238             maptable_to_char[(uint8_t) 5]='5';
00239             maptable_to_char[(uint8_t) 6]='6';
00240             maptable_to_char[(uint8_t) 7]='7';
00241             maptable_to_char[(uint8_t) 8]='8';
00242             maptable_to_char[(uint8_t) 9]='9';  //Translation 0-9 -> '0-9'
00243             break;
00244 
00245         case DIGIT2:
00246             valid_chars[(uint8_t) '0']=true;
00247             valid_chars[(uint8_t) '1']=true;
00248             valid_chars[(uint8_t) '2']=true; //Translation '0-2' -> 0-2
00249 
00250             maptable_to_bin[(uint8_t) '0']=0;
00251             maptable_to_bin[(uint8_t) '1']=1;
00252             maptable_to_bin[(uint8_t) '2']=2;   //Translation '0-2' -> 0-2
00253 
00254             maptable_to_char[(uint8_t) 0]='0';
00255             maptable_to_char[(uint8_t) 1]='1';
00256             maptable_to_char[(uint8_t) 2]='2'; //Translation 0-2 -> '0-2'
00257             break;
00258 
00259         case CUBE:
00260             valid_chars[(uint8_t) '1']=true;
00261             valid_chars[(uint8_t) '2']=true;
00262             valid_chars[(uint8_t) '3']=true;
00263             valid_chars[(uint8_t) '4']=true;
00264             valid_chars[(uint8_t) '5']=true;
00265             valid_chars[(uint8_t) '6']=true;    //Translation '123456' -> 012345
00266 
00267             maptable_to_bin[(uint8_t) '1']=0;
00268             maptable_to_bin[(uint8_t) '2']=1;
00269             maptable_to_bin[(uint8_t) '3']=2;
00270             maptable_to_bin[(uint8_t) '4']=3;
00271             maptable_to_bin[(uint8_t) '5']=4;
00272             maptable_to_bin[(uint8_t) '6']=5;   //Translation '123456' -> 012345
00273 
00274             maptable_to_char[(uint8_t) 0]='1';
00275             maptable_to_char[(uint8_t) 1]='2';
00276             maptable_to_char[(uint8_t) 2]='3';
00277             maptable_to_char[(uint8_t) 3]='4';
00278             maptable_to_char[(uint8_t) 4]='5';
00279             maptable_to_char[(uint8_t) 5]='6';  //Translation 012345->'123456'
00280             break;
00281 
00282         case PROTEIN:
00283             {
00284                 int32_t skip=0 ;
00285                 for (int32_t i=0; i<21; i++)
00286                 {
00287                     if (i==1) skip++ ;
00288                     if (i==8) skip++ ;
00289                     if (i==12) skip++ ;
00290                     if (i==17) skip++ ;
00291                     valid_chars['A'+i+skip]=true;
00292                     maptable_to_bin['A'+i+skip]=i ;
00293                     maptable_to_char[i]='A'+i+skip ;
00294                 } ;                   //Translation 012345->acde...xy -- the protein code
00295             } ;
00296             break;
00297 
00298         case BINARY:
00299             valid_chars[(uint8_t) '0']=true;
00300             valid_chars[(uint8_t) '1']=true;
00301 
00302             maptable_to_bin[(uint8_t) '0']=0;
00303             maptable_to_bin[(uint8_t) '1']=1;
00304 
00305             maptable_to_char[0]=(uint8_t) '0';
00306             maptable_to_char[1]=(uint8_t) '1';
00307             break;
00308 
00309         case ALPHANUM:
00310             {
00311                 for (int32_t i=0; i<26; i++)
00312                 {
00313                     valid_chars['A'+i]=true;
00314                     maptable_to_bin['A'+i]=i ;
00315                     maptable_to_char[i]='A'+i ;
00316                 } ;
00317                 for (int32_t i=0; i<10; i++)
00318                 {
00319                     valid_chars['0'+i]=true;
00320                     maptable_to_bin['0'+i]=26+i ;
00321                     maptable_to_char[26+i]='0'+i ;
00322                 } ;        //Translation 012345->acde...xy0123456789
00323             } ;
00324             break;
00325 
00326         case RAWBYTE:
00327             {
00328                 //identity
00329                 for (int32_t i=0; i<256; i++)
00330                 {
00331                     valid_chars[i]=true;
00332                     maptable_to_bin[i]=i;
00333                     maptable_to_char[i]=i;
00334                 }
00335             }
00336             break;
00337 
00338         case DNA:
00339             valid_chars[(uint8_t) 'A']=true;
00340             valid_chars[(uint8_t) 'C']=true;
00341             valid_chars[(uint8_t) 'G']=true;
00342             valid_chars[(uint8_t) 'T']=true;
00343 
00344             maptable_to_bin[(uint8_t) 'A']=B_A;
00345             maptable_to_bin[(uint8_t) 'C']=B_C;
00346             maptable_to_bin[(uint8_t) 'G']=B_G;
00347             maptable_to_bin[(uint8_t) 'T']=B_T;
00348 
00349             maptable_to_char[B_A]='A';
00350             maptable_to_char[B_C]='C';
00351             maptable_to_char[B_G]='G';
00352             maptable_to_char[B_T]='T';
00353             break;
00354         case RAWDNA:
00355             {
00356                 //identity
00357                 for (int32_t i=0; i<4; i++)
00358                 {
00359                     valid_chars[i]=true;
00360                     maptable_to_bin[i]=i;
00361                     maptable_to_char[i]=i;
00362                 }
00363             }
00364             break;
00365 
00366         case SNP:
00367             valid_chars[(uint8_t) 'A']=true;
00368             valid_chars[(uint8_t) 'C']=true;
00369             valid_chars[(uint8_t) 'G']=true;
00370             valid_chars[(uint8_t) 'T']=true;
00371             valid_chars[(uint8_t) '0']=true;
00372 
00373             maptable_to_bin[(uint8_t) 'A']=B_A;
00374             maptable_to_bin[(uint8_t) 'C']=B_C;
00375             maptable_to_bin[(uint8_t) 'G']=B_G;
00376             maptable_to_bin[(uint8_t) 'T']=B_T;
00377             maptable_to_bin[(uint8_t) '0']=B_0;
00378 
00379             maptable_to_char[B_A]='A';
00380             maptable_to_char[B_C]='C';
00381             maptable_to_char[B_G]='G';
00382             maptable_to_char[B_T]='T';
00383             maptable_to_char[B_0]='0';
00384             break;
00385         case RAWSNP:
00386             {
00387                 //identity
00388                 for (int32_t i=0; i<5; i++)
00389                 {
00390                     valid_chars[i]=true;
00391                     maptable_to_bin[i]=i;
00392                     maptable_to_char[i]=i;
00393                 }
00394             }
00395             break;
00396 
00397         case RNA:
00398             valid_chars[(uint8_t) 'A']=true;
00399             valid_chars[(uint8_t) 'C']=true;
00400             valid_chars[(uint8_t) 'G']=true;
00401             valid_chars[(uint8_t) 'U']=true;
00402 
00403             maptable_to_bin[(uint8_t) 'A']=B_A;
00404             maptable_to_bin[(uint8_t) 'C']=B_C;
00405             maptable_to_bin[(uint8_t) 'G']=B_G;
00406             maptable_to_bin[(uint8_t) 'U']=B_T;
00407 
00408             maptable_to_char[B_A]='A';
00409             maptable_to_char[B_C]='C';
00410             maptable_to_char[B_G]='G';
00411             maptable_to_char[B_T]='U';
00412             break;
00413 
00414         case IUPAC_NUCLEIC_ACID:
00415             valid_chars[(uint8_t) 'A']=true; // A   Adenine
00416             valid_chars[(uint8_t) 'C']=true; // C   Cytosine
00417             valid_chars[(uint8_t) 'G']=true; // G   Guanine
00418             valid_chars[(uint8_t) 'T']=true; // T   Thymine
00419             valid_chars[(uint8_t) 'U']=true; // U   Uracil
00420             valid_chars[(uint8_t) 'R']=true; // R   Purine (A or G)
00421             valid_chars[(uint8_t) 'Y']=true; // Y   Pyrimidine (C, T, or U)
00422             valid_chars[(uint8_t) 'M']=true; // M   C or A
00423             valid_chars[(uint8_t) 'K']=true; // K   T, U, or G
00424             valid_chars[(uint8_t) 'W']=true; // W   T, U, or A
00425             valid_chars[(uint8_t) 'S']=true; // S   C or G
00426             valid_chars[(uint8_t) 'B']=true; // B   C, T, U, or G (not A)
00427             valid_chars[(uint8_t) 'D']=true; // D   A, T, U, or G (not C)
00428             valid_chars[(uint8_t) 'H']=true; // H   A, T, U, or C (not G)
00429             valid_chars[(uint8_t) 'V']=true; // V   A, C, or G (not T, not U)
00430             valid_chars[(uint8_t) 'N']=true; // N   Any base (A, C, G, T, or U)
00431 
00432             maptable_to_bin[(uint8_t) 'A']=0; // A  Adenine
00433             maptable_to_bin[(uint8_t) 'C']=1; // C  Cytosine
00434             maptable_to_bin[(uint8_t) 'G']=2; // G  Guanine
00435             maptable_to_bin[(uint8_t) 'T']=3; // T  Thymine
00436             maptable_to_bin[(uint8_t) 'U']=4; // U  Uracil
00437             maptable_to_bin[(uint8_t) 'R']=5; // R  Purine (A or G)
00438             maptable_to_bin[(uint8_t) 'Y']=6; // Y  Pyrimidine (C, T, or U)
00439             maptable_to_bin[(uint8_t) 'M']=7; // M  C or A
00440             maptable_to_bin[(uint8_t) 'K']=8; // K  T, U, or G
00441             maptable_to_bin[(uint8_t) 'W']=9; // W  T, U, or A
00442             maptable_to_bin[(uint8_t) 'S']=10; // S C or G
00443             maptable_to_bin[(uint8_t) 'B']=11; // B C, T, U, or G (not A)
00444             maptable_to_bin[(uint8_t) 'D']=12; // D A, T, U, or G (not C)
00445             maptable_to_bin[(uint8_t) 'H']=13; // H A, T, U, or C (not G)
00446             maptable_to_bin[(uint8_t) 'V']=14; // V A, C, or G (not T, not U)
00447             maptable_to_bin[(uint8_t) 'N']=15; // N Any base (A, C, G, T, or U)
00448 
00449             maptable_to_char[0]=(uint8_t) 'A'; // A Adenine
00450             maptable_to_char[1]=(uint8_t) 'C'; // C Cytosine
00451             maptable_to_char[2]=(uint8_t) 'G'; // G Guanine
00452             maptable_to_char[3]=(uint8_t) 'T'; // T Thymine
00453             maptable_to_char[4]=(uint8_t) 'U'; // U Uracil
00454             maptable_to_char[5]=(uint8_t) 'R'; // R Purine (A or G)
00455             maptable_to_char[6]=(uint8_t) 'Y'; // Y Pyrimidine (C, T, or U)
00456             maptable_to_char[7]=(uint8_t) 'M'; // M C or A
00457             maptable_to_char[8]=(uint8_t) 'K'; // K T, U, or G
00458             maptable_to_char[9]=(uint8_t) 'W'; // W T, U, or A
00459             maptable_to_char[10]=(uint8_t) 'S'; // S    C or G
00460             maptable_to_char[11]=(uint8_t) 'B'; // B    C, T, U, or G (not A)
00461             maptable_to_char[12]=(uint8_t) 'D'; // D    A, T, U, or G (not C)
00462             maptable_to_char[13]=(uint8_t) 'H'; // H    A, T, U, or C (not G)
00463             maptable_to_char[14]=(uint8_t) 'V'; // V    A, C, or G (not T, not U)
00464             maptable_to_char[15]=(uint8_t) 'N'; // N    Any base (A, C, G, T, or U)
00465             break;
00466 
00467         case IUPAC_AMINO_ACID:
00468             valid_chars[(uint8_t) 'A']=true; //A    Ala Alanine
00469             valid_chars[(uint8_t) 'R']=true; //R    Arg Arginine
00470             valid_chars[(uint8_t) 'N']=true; //N    Asn Asparagine
00471             valid_chars[(uint8_t) 'D']=true; //D    Asp Aspartic acid
00472             valid_chars[(uint8_t) 'C']=true; //C    Cys Cysteine
00473             valid_chars[(uint8_t) 'Q']=true; //Q    Gln Glutamine
00474             valid_chars[(uint8_t) 'E']=true; //E    Glu Glutamic acid
00475             valid_chars[(uint8_t) 'G']=true; //G    Gly Glycine
00476             valid_chars[(uint8_t) 'H']=true; //H    His Histidine
00477             valid_chars[(uint8_t) 'I']=true; //I    Ile Isoleucine
00478             valid_chars[(uint8_t) 'L']=true; //L    Leu Leucine
00479             valid_chars[(uint8_t) 'K']=true; //K    Lys Lysine
00480             valid_chars[(uint8_t) 'M']=true; //M    Met Methionine
00481             valid_chars[(uint8_t) 'F']=true; //F    Phe Phenylalanine
00482             valid_chars[(uint8_t) 'P']=true; //P    Pro Proline
00483             valid_chars[(uint8_t) 'S']=true; //S    Ser Serine
00484             valid_chars[(uint8_t) 'T']=true; //T    Thr Threonine
00485             valid_chars[(uint8_t) 'W']=true; //W    Trp Tryptophan
00486             valid_chars[(uint8_t) 'Y']=true; //Y    Tyr Tyrosine
00487             valid_chars[(uint8_t) 'V']=true; //V    Val Valine
00488             valid_chars[(uint8_t) 'B']=true; //B    Asx Aspartic acid or Asparagine
00489             valid_chars[(uint8_t) 'Z']=true; //Z    Glx Glutamine or Glutamic acid
00490             valid_chars[(uint8_t) 'X']=true; //X    Xaa Any amino acid
00491 
00492             maptable_to_bin[(uint8_t) 'A']=0;  //A  Ala Alanine
00493             maptable_to_bin[(uint8_t) 'R']=1;  //R  Arg Arginine
00494             maptable_to_bin[(uint8_t) 'N']=2;  //N  Asn Asparagine
00495             maptable_to_bin[(uint8_t) 'D']=3;  //D  Asp Aspartic acid
00496             maptable_to_bin[(uint8_t) 'C']=4;  //C  Cys Cysteine
00497             maptable_to_bin[(uint8_t) 'Q']=5;  //Q  Gln Glutamine
00498             maptable_to_bin[(uint8_t) 'E']=6;  //E  Glu Glutamic acid
00499             maptable_to_bin[(uint8_t) 'G']=7;  //G  Gly Glycine
00500             maptable_to_bin[(uint8_t) 'H']=8;  //H  His Histidine
00501             maptable_to_bin[(uint8_t) 'I']=9;  //I  Ile Isoleucine
00502             maptable_to_bin[(uint8_t) 'L']=10; //L  Leu Leucine
00503             maptable_to_bin[(uint8_t) 'K']=11; //K  Lys Lysine
00504             maptable_to_bin[(uint8_t) 'M']=12; //M  Met Methionine
00505             maptable_to_bin[(uint8_t) 'F']=13; //F  Phe Phenylalanine
00506             maptable_to_bin[(uint8_t) 'P']=14; //P  Pro Proline
00507             maptable_to_bin[(uint8_t) 'S']=15; //S  Ser Serine
00508             maptable_to_bin[(uint8_t) 'T']=16; //T  Thr Threonine
00509             maptable_to_bin[(uint8_t) 'W']=17; //W  Trp Tryptophan
00510             maptable_to_bin[(uint8_t) 'Y']=18; //Y  Tyr Tyrosine
00511             maptable_to_bin[(uint8_t) 'V']=19; //V  Val Valine
00512             maptable_to_bin[(uint8_t) 'B']=20; //B  Asx Aspartic acid or Asparagine
00513             maptable_to_bin[(uint8_t) 'Z']=21; //Z  Glx Glutamine or Glutamic acid
00514             maptable_to_bin[(uint8_t) 'X']=22; //X  Xaa Any amino acid
00515 
00516             maptable_to_char[0]=(uint8_t) 'A';  //A Ala Alanine
00517             maptable_to_char[1]=(uint8_t) 'R';  //R Arg Arginine
00518             maptable_to_char[2]=(uint8_t) 'N';  //N Asn Asparagine
00519             maptable_to_char[3]=(uint8_t) 'D';  //D Asp Aspartic acid
00520             maptable_to_char[4]=(uint8_t) 'C';  //C Cys Cysteine
00521             maptable_to_char[5]=(uint8_t) 'Q';  //Q Gln Glutamine
00522             maptable_to_char[6]=(uint8_t) 'E';  //E Glu Glutamic acid
00523             maptable_to_char[7]=(uint8_t) 'G';  //G Gly Glycine
00524             maptable_to_char[8]=(uint8_t) 'H';  //H His Histidine
00525             maptable_to_char[9]=(uint8_t) 'I';  //I Ile Isoleucine
00526             maptable_to_char[10]=(uint8_t) 'L'; //L Leu Leucine
00527             maptable_to_char[11]=(uint8_t) 'K'; //K Lys Lysine
00528             maptable_to_char[12]=(uint8_t) 'M'; //M Met Methionine
00529             maptable_to_char[13]=(uint8_t) 'F'; //F Phe Phenylalanine
00530             maptable_to_char[14]=(uint8_t) 'P'; //P Pro Proline
00531             maptable_to_char[15]=(uint8_t) 'S'; //S Ser Serine
00532             maptable_to_char[16]=(uint8_t) 'T'; //T Thr Threonine
00533             maptable_to_char[17]=(uint8_t) 'W'; //W Trp Tryptophan
00534             maptable_to_char[18]=(uint8_t) 'Y'; //Y Tyr Tyrosine
00535             maptable_to_char[19]=(uint8_t) 'V'; //V Val Valine
00536             maptable_to_char[20]=(uint8_t) 'B'; //B Asx Aspartic acid or Asparagine
00537             maptable_to_char[21]=(uint8_t) 'Z'; //Z Glx Glutamine or Glutamic acid
00538             maptable_to_char[22]=(uint8_t) 'X'; //X Xaa Any amino acid
00539             break;
00540 
00541         default:
00542             break; //leave uninitialised
00543     };
00544 }
00545 
00546 void CAlphabet::clear_histogram()
00547 {
00548     memset(histogram, 0, sizeof(histogram));
00549     print_histogram();
00550 }
00551 
00552 int32_t CAlphabet::get_max_value_in_histogram()
00553 {
00554     int32_t max_sym=-1;
00555     for (int32_t i=(int32_t) (1 <<(sizeof(uint8_t)*8))-1;i>=0; i--)
00556     {
00557         if (histogram[i])
00558         {
00559             max_sym=i;
00560             break;
00561         }
00562     }
00563 
00564     return max_sym;
00565 }
00566 
00567 int32_t CAlphabet::get_num_symbols_in_histogram()
00568 {
00569     int32_t num_sym=0;
00570     for (int32_t i=0; i<(int32_t) (1 <<(sizeof(uint8_t)*8)); i++)
00571     {
00572         if (histogram[i])
00573             num_sym++;
00574     }
00575 
00576     return num_sym;
00577 }
00578 
00579 int32_t CAlphabet::get_num_bits_in_histogram()
00580 {
00581     int32_t num_sym=get_num_symbols_in_histogram();
00582     if (num_sym>0)
00583         return (int32_t) ceil(log((float64_t) num_sym)/log((float64_t) 2));
00584     else
00585         return 0;
00586 }
00587 
00588 void CAlphabet::print_histogram()
00589 {
00590     for (int32_t i=0; i<(int32_t) (1 <<(sizeof(uint8_t)*8)); i++)
00591     {
00592         if (histogram[i])
00593             SG_PRINT( "hist[%d]=%lld\n", i, histogram[i]);
00594     }
00595 }
00596 
00597 bool CAlphabet::check_alphabet(bool print_error)
00598 {
00599     bool result = true;
00600 
00601     for (int32_t i=0; i<(int32_t) (1 <<(sizeof(uint8_t)*8)); i++)
00602     {
00603         if (histogram[i]>0 && valid_chars[i]==0)
00604         {
00605             result=false;
00606             break;
00607         }
00608     }
00609 
00610     if (!result && print_error)
00611     {
00612         print_histogram();
00613         SG_ERROR( "ALPHABET does not contain all symbols in histogram\n");
00614     }
00615 
00616     return result;
00617 }
00618 
00619 bool CAlphabet::check_alphabet_size(bool print_error)
00620 {
00621     if (get_num_bits_in_histogram() > get_num_bits())
00622     {
00623         if (print_error)
00624         {
00625             print_histogram();
00626             fprintf(stderr, "get_num_bits_in_histogram()=%i > get_num_bits()=%i\n", get_num_bits_in_histogram(), get_num_bits()) ;
00627          SG_ERROR( "ALPHABET too small to contain all symbols in histogram\n");
00628         }
00629         return false;
00630     }
00631     else
00632         return true;
00633 
00634 }
00635 
00636 void CAlphabet::copy_histogram(CAlphabet* a)
00637 {
00638     memcpy(histogram, a->get_histogram(), sizeof(histogram));
00639 }
00640 
00641 const char* CAlphabet::get_alphabet_name(EAlphabet alphabet)
00642 {
00643 
00644     int32_t idx;
00645     switch (alphabet)
00646     {
00647         case DNA:
00648             idx=0;
00649             break;
00650         case RAWDNA:
00651             idx=1;
00652             break;
00653         case RNA:
00654             idx=2;
00655             break;
00656         case PROTEIN:
00657             idx=3;
00658             break;
00659         case BINARY:
00660             idx=4;
00661             break;
00662         case ALPHANUM:
00663             idx=5;
00664             break;
00665         case CUBE:
00666             idx=6;
00667             break;
00668         case RAWBYTE:
00669             idx=7;
00670             break;
00671         case IUPAC_NUCLEIC_ACID:
00672             idx=8;
00673             break;
00674         case IUPAC_AMINO_ACID:
00675             idx=9;
00676             break;
00677         case NONE:
00678             idx=10;
00679             break;
00680         case DIGIT:
00681             idx=11;
00682             break;
00683         case DIGIT2:
00684             idx=12;
00685             break;
00686         default:
00687             idx=13;
00688             break;
00689     }
00690     return alphabet_names[idx];
00691 }
00692 
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation