Alphabet.h

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2006-2009 Soeren Sonnenburg
00008  * Copyright (C) 2006-2009 Fraunhofer Institute FIRST and Max-Planck-Society
00009  */
00010 
00011 #ifndef _CALPHABET__H__
00012 #define _CALPHABET__H__
00013 
00014 #include "base/SGObject.h"
00015 #include "lib/Mathematics.h"
00016 #include "lib/common.h"
00017 
00018 namespace shogun
00019 {
00021 enum EAlphabet
00022 {
00024     DNA=0,
00025 
00027     RAWDNA=1,
00028 
00030     RNA=2,
00031 
00033     PROTEIN=3,
00034 
00035     // BINARY just 0 and 1
00036     BINARY=4,
00037 
00039     ALPHANUM=5,
00040 
00042     CUBE=6,
00043 
00045     RAWBYTE=7,
00046 
00048     IUPAC_NUCLEIC_ACID=8,
00049 
00051     IUPAC_AMINO_ACID=9,
00052 
00054     NONE=10,
00055 
00057     DIGIT=11,
00058 
00060     DIGIT2=12,
00061 
00063     RAWDIGIT=13,
00064 
00066     RAWDIGIT2=14,
00067 
00069     UNKNOWN=15,
00070 
00072     SNP=16,
00073 
00075     RAWSNP=17
00076 };
00077 
00078 
00089 class CAlphabet : public CSGObject
00090 {
00091     public:
00092 
00096         //CAlphabet();
00097 
00098 
00104         CAlphabet(char* alpha, int32_t len);
00105 
00110         CAlphabet(EAlphabet alpha);
00111 
00116         CAlphabet(CAlphabet* alpha);
00117         virtual ~CAlphabet();
00118 
00123         bool set_alphabet(EAlphabet alpha);
00124 
00129         inline EAlphabet get_alphabet() const
00130         {
00131             return alphabet;
00132         }
00133 
00138         inline int32_t get_num_symbols() const
00139         {
00140             return num_symbols;
00141         }
00142 
00148         inline int32_t get_num_bits() const
00149         {
00150             return num_bits;
00151         }
00152 
00158         inline uint8_t remap_to_bin(uint8_t c)
00159         {
00160             return maptable_to_bin[c];
00161         }
00162 
00168         inline uint8_t remap_to_char(uint8_t c)
00169         {
00170             return maptable_to_char[c];
00171         }
00172 
00174         void clear_histogram();
00175 
00181         template <class T>
00182         void add_string_to_histogram(T* p, int64_t len)
00183         {
00184             for (int64_t i=0; i<len; i++)
00185                 add_byte_to_histogram((uint8_t) (p[i]));
00186         }
00187 
00192         inline void add_byte_to_histogram(uint8_t p)
00193         {
00194             histogram[p]++;
00195         }
00196 
00198         void print_histogram();
00199 
00205         inline void get_hist(int64_t** h, int32_t* len)
00206         {
00207             int32_t hist_size=(1 << (sizeof(uint8_t)*8));
00208             ASSERT(h && len);
00209             *h=(int64_t*) malloc(sizeof(int64_t)*hist_size);
00210             ASSERT(*h);
00211             *len=hist_size;
00212             ASSERT(*len);
00213             memcpy(*h, &histogram[0], sizeof(int64_t)*hist_size);
00214         }
00215 
00217         inline const int64_t* get_histogram()
00218         {
00219             return &histogram[0];
00220         }
00221 
00228         bool check_alphabet(bool print_error=true);
00229 
00236         inline bool is_valid(uint8_t c)
00237         {
00238             return valid_chars[c];
00239         }
00240 
00246         bool check_alphabet_size(bool print_error=true);
00247 
00252         int32_t get_num_symbols_in_histogram();
00253 
00258         int32_t get_max_value_in_histogram();
00259 
00266         int32_t get_num_bits_in_histogram();
00267 
00272         static const char* get_alphabet_name(EAlphabet alphabet);
00273 
00274 
00276         inline virtual const char* get_name() const { return "Alphabet"; }
00277 
00286         template <class ST>
00287         static void translate_from_single_order(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val)
00288         {
00289             int32_t i,j;
00290             ST value=0;
00291 
00292             for (i=sequence_length-1; i>= p_order-1; i--) //convert interval of size T
00293             {
00294                 value=0;
00295                 for (j=i; j>=i-p_order+1; j--)
00296                     value= (value >> max_val) | (obs[j] << (max_val * (p_order-1)));
00297 
00298                 obs[i]= (ST) value;
00299             }
00300 
00301             for (i=p_order-2;i>=0;i--)
00302             {
00303                 if (i>=sequence_length)
00304                     continue;
00305 
00306                 value=0;
00307                 for (j=i; j>=i-p_order+1; j--)
00308                 {
00309                     value= (value >> max_val);
00310                     if (j>=0 && j<sequence_length)
00311                         value|=obs[j] << (max_val * (p_order-1));
00312                 }
00313                 obs[i]=value;
00314             }
00315 
00316             // TODO we should get rid of this loop!
00317             if (start>0)
00318             {
00319                 for (i=start; i<sequence_length; i++)
00320                     obs[i-start]=obs[i];
00321             }
00322         }
00323 
00332         template <class ST>
00333         static void translate_from_single_order_reversed(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val)
00334         {
00335             int32_t i,j;
00336             ST value=0;
00337 
00338             for (i=sequence_length-1; i>= p_order-1; i--) //convert interval of size T
00339             {
00340                 value=0;
00341                 for (j=i; j>=i-p_order+1; j--)
00342                     value= (value << max_val) | obs[j];
00343 
00344                 obs[i]= (ST) value;
00345             }
00346 
00347             for (i=p_order-2;i>=0;i--)
00348             {
00349                 if (i>=sequence_length)
00350                     continue;
00351 
00352                 value=0;
00353                 for (j=i; j>=i-p_order+1; j--)
00354                 {
00355                     value= (value << max_val);
00356                     if (j>=0 && j<sequence_length)
00357                         value|=obs[j];
00358                 }
00359                 obs[i]=value;
00360             }
00361 
00362             // TODO we should get rid of this loop!
00363             if (start>0)
00364             {
00365                 for (i=start; i<sequence_length; i++)
00366                     obs[i-start]=obs[i];
00367             }
00368         }
00369 
00379         template <class ST>
00380         static void translate_from_single_order(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00381         {
00382             ASSERT(gap>=0);
00383 
00384             const int32_t start_gap=(p_order-gap)/2;
00385             const int32_t end_gap=start_gap+gap;
00386 
00387             int32_t i,j;
00388             ST value=0;
00389 
00390             // almost all positions
00391             for (i=sequence_length-1; i>=p_order-1; i--) //convert interval of size T
00392             {
00393                 value=0;
00394                 for (j=i; j>=i-p_order+1; j--)
00395                 {
00396                     if (i-j<start_gap)
00397                     {
00398                         value= (value >> max_val) | (obs[j] << (max_val * (p_order-1-gap)));
00399                     }
00400                     else if (i-j>=end_gap)
00401                     {
00402                         value= (value >> max_val) | (obs[j] << (max_val * (p_order-1-gap)));
00403                     }
00404                 }
00405                 obs[i]= (ST) value;
00406             }
00407 
00408             // the remaining `order` positions
00409             for (i=p_order-2;i>=0;i--)
00410             {
00411                 if (i>=sequence_length)
00412                     continue;
00413 
00414                 value=0;
00415                 for (j=i; j>=i-p_order+1; j--)
00416                 {
00417                     if (i-j<start_gap)
00418                     {
00419                         value= (value >> max_val);
00420                         if (j>=0 && j<sequence_length)
00421                             value|=obs[j] << (max_val * (p_order-1-gap));
00422                     }
00423                     else if (i-j>=end_gap)
00424                     {
00425                         value= (value >> max_val);
00426                         if (j>=0 && j<sequence_length)
00427                             value|=obs[j] << (max_val * (p_order-1-gap));
00428                     }
00429                 }
00430                 obs[i]=value;
00431             }
00432 
00433             // TODO we should get rid of this loop!
00434             if (start>0)
00435             {
00436                 for (i=start; i<sequence_length; i++)
00437                     obs[i-start]=obs[i];
00438             }
00439         }
00440 
00450         template <class ST>
00451         static void translate_from_single_order_reversed(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00452         {
00453             ASSERT(gap>=0);
00454 
00455             const int32_t start_gap=(p_order-gap)/2;
00456             const int32_t end_gap=start_gap+gap;
00457 
00458             int32_t i,j;
00459             ST value=0;
00460 
00461             // almost all positions
00462             for (i=sequence_length-1; i>=p_order-1; i--) //convert interval of size T
00463             {
00464                 value=0;
00465                 for (j=i; j>=i-p_order+1; j--)
00466                 {
00467                     if (i-j<start_gap)
00468                         value= (value << max_val) | obs[j];
00469                     else if (i-j>=end_gap)
00470                         value= (value << max_val) | obs[j];
00471                 }
00472                 obs[i]= (ST) value;
00473             }
00474 
00475             // the remaining `order` positions
00476             for (i=p_order-2;i>=0;i--)
00477             {
00478                 if (i>=sequence_length)
00479                     continue;
00480 
00481                 value=0;
00482                 for (j=i; j>=i-p_order+1; j--)
00483                 {
00484                     if (i-j<start_gap)
00485                     {
00486                         value= value << max_val;
00487                         if (j>=0 && j<sequence_length)
00488                             value|=obs[j];
00489                     }
00490                     else if (i-j>=end_gap)
00491                     {
00492                         value= value << max_val;
00493                         if (j>=0 && j<sequence_length)
00494                             value|=obs[j];
00495                     }
00496                 }
00497                 obs[i]=value;
00498             }
00499 
00500             // TODO we should get rid of this loop!
00501             if (start>0)
00502             {
00503                 for (i=start; i<sequence_length; i++)
00504                     obs[i-start]=obs[i];
00505             }
00506         }
00507 
00508 
00509     protected:
00511         void init_map_table();
00512 
00517         void copy_histogram(CAlphabet* src);
00518 
00519 
00520 
00521 #ifdef HAVE_BOOST_SERIALIZATION
00522     protected:
00523 
00524         friend class ::boost::serialization::access;
00525 
00526         template<class Archive>
00527 
00528             void serialize(Archive & ar, const unsigned int archive_version)
00529             {
00530 
00531                 SG_DEBUG("archiving CAlphabet\n");
00532 
00533                 ar & ::boost::serialization::base_object<CSGObject>(*this);
00534 
00535                 ar & num_bits;
00536 
00537                 SG_DEBUG("done with CAlphabet\n");
00538 
00539             }
00540 
00541 
00542 #endif //HAVE_BOOST_SERIALIZATION
00543 
00544 
00545     public:
00547         static const uint8_t B_A;
00549         static const uint8_t B_C;
00551         static const uint8_t B_G;
00553         static const uint8_t B_T;
00555         static const uint8_t B_0;
00557         static const uint8_t MAPTABLE_UNDEF;
00559         static const char* alphabet_names[18];
00560 
00561     protected:
00563         EAlphabet alphabet;
00565         int32_t num_symbols;
00567         int32_t num_bits;
00569         bool valid_chars[1 << (sizeof(uint8_t)*8)];
00571         uint8_t maptable_to_bin[1 << (sizeof(uint8_t)*8)];
00573         uint8_t maptable_to_char[1 << (sizeof(uint8_t)*8)];
00575         int64_t histogram[1 << (sizeof(uint8_t)*8)];
00576 };
00577 
00578 
00579 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00580 template<> inline void CAlphabet::translate_from_single_order(float32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00581 {
00582 }
00583 
00584 template<> inline void CAlphabet::translate_from_single_order(float64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00585 {
00586 }
00587 
00588 template<> inline void CAlphabet::translate_from_single_order(floatmax_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00589 {
00590 }
00591 
00592 template<> inline void CAlphabet::translate_from_single_order_reversed(float32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00593 {
00594 }
00595 
00596 template<> inline void CAlphabet::translate_from_single_order_reversed(float64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00597 {
00598 }
00599 
00600 template<> inline void CAlphabet::translate_from_single_order_reversed(floatmax_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00601 {
00602 }
00603 #endif
00604 
00605 }
00606 
00607 #ifdef HAVE_BOOST_SERIALIZATION
00608 
00609 namespace boost {
00610 namespace serialization {
00611    template<class Archive>
00612     inline void save_construct_data(Archive & ar, shogun::CAlphabet* t, const unsigned int archive_version)
00613     {
00614       std::cout << "archiving construction data CAlphabet" << std::endl;
00615       ar << t->alphabet;
00616       std::cout << "done archiving construction data CAlphabet" << std::endl;
00617     }
00618 
00619     template<class Archive>
00620     inline void load_construct_data(Archive & ar, shogun::CAlphabet * t, const unsigned int archive_version)
00621     {
00622       //SG_DEBUG("archiving construction data CAlphabet\n");
00623       shogun::EAlphabet a;
00624       ar >> a;
00625       new(t)shogun::CAlphabet(a);
00626       //SG_DEBUG("done construction data archiving CAlphabet\n");
00627     }
00628 } // serialization
00629 } // namespace boost
00630 #endif //HAVE_BOOST_SERIALIZATION
00631 
00632 
00633 
00634 #endif
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation