Alphabet.h

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2006-2009 Soeren Sonnenburg
00008  * Copyright (C) 2006-2009 Fraunhofer Institute FIRST and Max-Planck-Society
00009  */
00010 
00011 #ifndef _CALPHABET__H__
00012 #define _CALPHABET__H__
00013 
00014 #include "base/SGObject.h"
00015 #include "lib/Mathematics.h"
00016 #include "lib/common.h"
00017 
00018 
/**
 * Identifiers of the alphabets that CAlphabet can be set to.
 *
 * The numeric values are assigned explicitly and are NOT contiguous: the
 * value 4 is unused.  There are 11 enumerators but the largest value is 11,
 * so an enumerator must not be used directly as an index into a densely
 * packed 11-element table.
 *
 * The symbol set that belongs to each identifier is defined by the
 * implementation, not by this header; the per-enumerator remarks below are
 * derived from the names only and should be confirmed there.
 */
enum EAlphabet
{
    /** DNA alphabet */
    DNA=0,

    /** "raw" DNA alphabet (presumably numerically encoded nucleotides - verify) */
    RAWDNA=1,

    /** RNA alphabet */
    RNA=2,

    /** protein (amino acid) alphabet */
    PROTEIN=3,

    /* NOTE(review): 4 is intentionally left unassigned here - keep the gap
     * so that stored/serialised values keep their meaning. */

    /** alphanumeric alphabet */
    ALPHANUM=5,

    /** "cube" alphabet (presumably the faces of a die - verify) */
    CUBE=6,

    /** raw bytes (presumably every uint8_t value is a symbol - verify) */
    RAWBYTE=7,

    /** IUPAC nucleic acid codes */
    IUPAC_NUCLEIC_ACID=8,

    /** IUPAC amino acid codes */
    IUPAC_AMINO_ACID=9,

    /** no alphabet */
    NONE=10,

    /** alphabet not known */
    UNKNOWN=11
};
00055 
00056 
00067 class CAlphabet : public CSGObject
00068 {
00069     public:
00075         CAlphabet(char* alpha, int32_t len);
00076 
00081         CAlphabet(EAlphabet alpha);
00082 
00087         CAlphabet(CAlphabet* alpha);
00088         virtual ~CAlphabet();
00089 
00094         bool set_alphabet(EAlphabet alpha);
00095 
00100         inline EAlphabet get_alphabet()
00101         {
00102             return alphabet;
00103         }
00104 
00109         inline int32_t get_num_symbols()
00110         {
00111             return num_symbols;
00112         }
00113 
00119         inline int32_t get_num_bits()
00120         {
00121             return num_bits;
00122         }
00123 
00129         inline uint8_t remap_to_bin(uint8_t c)
00130         {
00131             return maptable_to_bin[c];
00132         }
00133 
00139         inline uint8_t remap_to_char(uint8_t c)
00140         {
00141             return maptable_to_char[c];
00142         }
00143 
00145         void clear_histogram();
00146 
00152         template <class T>
00153         void add_string_to_histogram(T* p, int64_t len)
00154         {
00155             for (int64_t i=0; i<len; i++)
00156                 add_byte_to_histogram((uint8_t) (p[i]));
00157         }
00158 
00163         inline void add_byte_to_histogram(uint8_t p)
00164         {
00165             histogram[p]++;
00166         }
00167 
00169         void print_histogram();
00170 
00176         inline void get_hist(int64_t** h, int32_t* len)
00177         {
00178             int32_t hist_size=(1 << (sizeof(uint8_t)*8));
00179             ASSERT(h && len);
00180             *h=(int64_t*) malloc(sizeof(int64_t)*hist_size);
00181             ASSERT(*h);
00182             *len=hist_size;
00183             ASSERT(*len);
00184             memcpy(*h, &histogram[0], sizeof(int64_t)*hist_size);
00185         }
00186 
00188         inline const int64_t* get_histogram()
00189         {
00190             return &histogram[0];
00191         }
00192 
00199         bool check_alphabet(bool print_error=true);
00200 
00207         inline bool is_valid(uint8_t c)
00208         {
00209             return valid_chars[c];
00210         }
00211 
00217         bool check_alphabet_size(bool print_error=true);
00218 
00223         int32_t get_num_symbols_in_histogram();
00224 
00229         int32_t get_max_value_in_histogram();
00230 
00237         int32_t get_num_bits_in_histogram();
00238 
00243         static const char* get_alphabet_name(EAlphabet alphabet);
00244 
00245 
00247         inline virtual const char* get_name() const { return "Alphabet"; }
00248 
00249 
00250     protected:
00252         void init_map_table();
00253 
00258         void copy_histogram(CAlphabet* src);
00259 
00260     public:
00262         static const uint8_t B_A;
00264         static const uint8_t B_C;
00266         static const uint8_t B_G;
00268         static const uint8_t B_T;
00270         static const uint8_t MAPTABLE_UNDEF;
00272         static const char* alphabet_names[11];
00273 
00274     protected:
00276         EAlphabet alphabet;
00278         int32_t num_symbols;
00280         int32_t num_bits;
00282         bool valid_chars[1 << (sizeof(uint8_t)*8)];
00284         uint8_t maptable_to_bin[1 << (sizeof(uint8_t)*8)];
00286         uint8_t maptable_to_char[1 << (sizeof(uint8_t)*8)];
00288         int64_t histogram[1 << (sizeof(uint8_t)*8)];
00289 };
00290 #endif

SHOGUN Machine Learning Toolbox - Documentation