Alphabet.h
Go to the documentation of this file.
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011 #ifndef _CALPHABET__H__
00012 #define _CALPHABET__H__
00013
00014 #include "base/SGObject.h"
00015 #include "lib/Mathematics.h"
00016 #include "lib/common.h"
00017
00018
/**
 * Identifier of the alphabet a CAlphabet object represents.
 *
 * The numeric values are spelled out explicitly and must not be renumbered:
 * value 4 is intentionally left unused here (NOTE(review): presumably a
 * retired or reserved alphabet type - confirm before reusing it).
 */
enum EAlphabet
{
	/** DNA alphabet */
	DNA=0,

	/** raw DNA alphabet (presumably already numerically encoded - confirm) */
	RAWDNA=1,

	/** RNA alphabet */
	RNA=2,

	/** protein alphabet */
	PROTEIN=3,

	/** alphanumeric alphabet */
	ALPHANUM=5,

	/** cube alphabet (presumably dice faces - confirm) */
	CUBE=6,

	/** raw bytes */
	RAWBYTE=7,

	/** IUPAC nucleic acid codes */
	IUPAC_NUCLEIC_ACID=8,

	/** IUPAC amino acid codes */
	IUPAC_AMINO_ACID=9,

	/** no alphabet */
	NONE=10,

	/** unknown alphabet */
	UNKNOWN=11,
};
00055
00056
00067 class CAlphabet : public CSGObject
00068 {
00069 public:
00075 CAlphabet(char* alpha, int32_t len);
00076
00081 CAlphabet(EAlphabet alpha);
00082
00087 CAlphabet(CAlphabet* alpha);
00088 virtual ~CAlphabet();
00089
00094 bool set_alphabet(EAlphabet alpha);
00095
00100 inline EAlphabet get_alphabet()
00101 {
00102 return alphabet;
00103 }
00104
00109 inline int32_t get_num_symbols()
00110 {
00111 return num_symbols;
00112 }
00113
00119 inline int32_t get_num_bits()
00120 {
00121 return num_bits;
00122 }
00123
00129 inline uint8_t remap_to_bin(uint8_t c)
00130 {
00131 return maptable_to_bin[c];
00132 }
00133
00139 inline uint8_t remap_to_char(uint8_t c)
00140 {
00141 return maptable_to_char[c];
00142 }
00143
00145 void clear_histogram();
00146
00152 template <class T>
00153 void add_string_to_histogram(T* p, int64_t len)
00154 {
00155 for (int64_t i=0; i<len; i++)
00156 add_byte_to_histogram((uint8_t) (p[i]));
00157 }
00158
00163 inline void add_byte_to_histogram(uint8_t p)
00164 {
00165 histogram[p]++;
00166 }
00167
00169 void print_histogram();
00170
00176 inline void get_hist(int64_t** h, int32_t* len)
00177 {
00178 int32_t hist_size=(1 << (sizeof(uint8_t)*8));
00179 ASSERT(h && len);
00180 *h=(int64_t*) malloc(sizeof(int64_t)*hist_size);
00181 ASSERT(*h);
00182 *len=hist_size;
00183 ASSERT(*len);
00184 memcpy(*h, &histogram[0], sizeof(int64_t)*hist_size);
00185 }
00186
00188 inline const int64_t* get_histogram()
00189 {
00190 return &histogram[0];
00191 }
00192
00199 bool check_alphabet(bool print_error=true);
00200
00207 inline bool is_valid(uint8_t c)
00208 {
00209 return valid_chars[c];
00210 }
00211
00217 bool check_alphabet_size(bool print_error=true);
00218
00223 int32_t get_num_symbols_in_histogram();
00224
00229 int32_t get_max_value_in_histogram();
00230
00237 int32_t get_num_bits_in_histogram();
00238
00243 static const char* get_alphabet_name(EAlphabet alphabet);
00244
00245
00247 inline virtual const char* get_name() const { return "Alphabet"; }
00248
00249
00250 protected:
00252 void init_map_table();
00253
00258 void copy_histogram(CAlphabet* src);
00259
00260 public:
00262 static const uint8_t B_A;
00264 static const uint8_t B_C;
00266 static const uint8_t B_G;
00268 static const uint8_t B_T;
00270 static const uint8_t MAPTABLE_UNDEF;
00272 static const char* alphabet_names[11];
00273
00274 protected:
00276 EAlphabet alphabet;
00278 int32_t num_symbols;
00280 int32_t num_bits;
00282 bool valid_chars[1 << (sizeof(uint8_t)*8)];
00284 uint8_t maptable_to_bin[1 << (sizeof(uint8_t)*8)];
00286 uint8_t maptable_to_char[1 << (sizeof(uint8_t)*8)];
00288 int64_t histogram[1 << (sizeof(uint8_t)*8)];
00289 };
00290 #endif