00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011 #ifndef _CALPHABET__H__
00012 #define _CALPHABET__H__
00013
00014 #include "base/SGObject.h"
00015 #include "lib/Mathematics.h"
00016 #include "lib/common.h"
00017
00018 namespace shogun
00019 {
00021 enum EAlphabet
00022 {
00024 DNA=0,
00025
00027 RAWDNA=1,
00028
00030 RNA=2,
00031
00033 PROTEIN=3,
00034
00035
00036 BINARY=4,
00037
00039 ALPHANUM=5,
00040
00042 CUBE=6,
00043
00045 RAWBYTE=7,
00046
00048 IUPAC_NUCLEIC_ACID=8,
00049
00051 IUPAC_AMINO_ACID=9,
00052
00054 NONE=10,
00055
00057 DIGIT=11,
00058
00060 DIGIT2=12,
00061
00063 RAWDIGIT=13,
00064
00066 RAWDIGIT2=14,
00067
00069 UNKNOWN=15,
00070
00072 SNP=16,
00073
00075 RAWSNP=17
00076 };
00077
00078
00089 class CAlphabet : public CSGObject
00090 {
00091 public:
00092
00096
00097
00098
00104 CAlphabet(char* alpha, int32_t len);
00105
00110 CAlphabet(EAlphabet alpha);
00111
00116 CAlphabet(CAlphabet* alpha);
00117 virtual ~CAlphabet();
00118
00123 bool set_alphabet(EAlphabet alpha);
00124
00129 inline EAlphabet get_alphabet() const
00130 {
00131 return alphabet;
00132 }
00133
00138 inline int32_t get_num_symbols() const
00139 {
00140 return num_symbols;
00141 }
00142
00148 inline int32_t get_num_bits() const
00149 {
00150 return num_bits;
00151 }
00152
00158 inline uint8_t remap_to_bin(uint8_t c)
00159 {
00160 return maptable_to_bin[c];
00161 }
00162
00168 inline uint8_t remap_to_char(uint8_t c)
00169 {
00170 return maptable_to_char[c];
00171 }
00172
00174 void clear_histogram();
00175
00181 template <class T>
00182 void add_string_to_histogram(T* p, int64_t len)
00183 {
00184 for (int64_t i=0; i<len; i++)
00185 add_byte_to_histogram((uint8_t) (p[i]));
00186 }
00187
00192 inline void add_byte_to_histogram(uint8_t p)
00193 {
00194 histogram[p]++;
00195 }
00196
00198 void print_histogram();
00199
00205 inline void get_hist(int64_t** h, int32_t* len)
00206 {
00207 int32_t hist_size=(1 << (sizeof(uint8_t)*8));
00208 ASSERT(h && len);
00209 *h=(int64_t*) malloc(sizeof(int64_t)*hist_size);
00210 ASSERT(*h);
00211 *len=hist_size;
00212 ASSERT(*len);
00213 memcpy(*h, &histogram[0], sizeof(int64_t)*hist_size);
00214 }
00215
00217 inline const int64_t* get_histogram()
00218 {
00219 return &histogram[0];
00220 }
00221
00228 bool check_alphabet(bool print_error=true);
00229
00236 inline bool is_valid(uint8_t c)
00237 {
00238 return valid_chars[c];
00239 }
00240
00246 bool check_alphabet_size(bool print_error=true);
00247
00252 int32_t get_num_symbols_in_histogram();
00253
00258 int32_t get_max_value_in_histogram();
00259
00266 int32_t get_num_bits_in_histogram();
00267
00272 static const char* get_alphabet_name(EAlphabet alphabet);
00273
00274
00276 inline virtual const char* get_name() const { return "Alphabet"; }
00277
00286 template <class ST>
00287 static void translate_from_single_order(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val)
00288 {
00289 int32_t i,j;
00290 ST value=0;
00291
00292 for (i=sequence_length-1; i>= p_order-1; i--)
00293 {
00294 value=0;
00295 for (j=i; j>=i-p_order+1; j--)
00296 value= (value >> max_val) | (obs[j] << (max_val * (p_order-1)));
00297
00298 obs[i]= (ST) value;
00299 }
00300
00301 for (i=p_order-2;i>=0;i--)
00302 {
00303 if (i>=sequence_length)
00304 continue;
00305
00306 value=0;
00307 for (j=i; j>=i-p_order+1; j--)
00308 {
00309 value= (value >> max_val);
00310 if (j>=0 && j<sequence_length)
00311 value|=obs[j] << (max_val * (p_order-1));
00312 }
00313 obs[i]=value;
00314 }
00315
00316
00317 if (start>0)
00318 {
00319 for (i=start; i<sequence_length; i++)
00320 obs[i-start]=obs[i];
00321 }
00322 }
00323
00332 template <class ST>
00333 static void translate_from_single_order_reversed(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val)
00334 {
00335 int32_t i,j;
00336 ST value=0;
00337
00338 for (i=sequence_length-1; i>= p_order-1; i--)
00339 {
00340 value=0;
00341 for (j=i; j>=i-p_order+1; j--)
00342 value= (value << max_val) | obs[j];
00343
00344 obs[i]= (ST) value;
00345 }
00346
00347 for (i=p_order-2;i>=0;i--)
00348 {
00349 if (i>=sequence_length)
00350 continue;
00351
00352 value=0;
00353 for (j=i; j>=i-p_order+1; j--)
00354 {
00355 value= (value << max_val);
00356 if (j>=0 && j<sequence_length)
00357 value|=obs[j];
00358 }
00359 obs[i]=value;
00360 }
00361
00362
00363 if (start>0)
00364 {
00365 for (i=start; i<sequence_length; i++)
00366 obs[i-start]=obs[i];
00367 }
00368 }
00369
00379 template <class ST>
00380 static void translate_from_single_order(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00381 {
00382 ASSERT(gap>=0);
00383
00384 const int32_t start_gap=(p_order-gap)/2;
00385 const int32_t end_gap=start_gap+gap;
00386
00387 int32_t i,j;
00388 ST value=0;
00389
00390
00391 for (i=sequence_length-1; i>=p_order-1; i--)
00392 {
00393 value=0;
00394 for (j=i; j>=i-p_order+1; j--)
00395 {
00396 if (i-j<start_gap)
00397 {
00398 value= (value >> max_val) | (obs[j] << (max_val * (p_order-1-gap)));
00399 }
00400 else if (i-j>=end_gap)
00401 {
00402 value= (value >> max_val) | (obs[j] << (max_val * (p_order-1-gap)));
00403 }
00404 }
00405 obs[i]= (ST) value;
00406 }
00407
00408
00409 for (i=p_order-2;i>=0;i--)
00410 {
00411 if (i>=sequence_length)
00412 continue;
00413
00414 value=0;
00415 for (j=i; j>=i-p_order+1; j--)
00416 {
00417 if (i-j<start_gap)
00418 {
00419 value= (value >> max_val);
00420 if (j>=0 && j<sequence_length)
00421 value|=obs[j] << (max_val * (p_order-1-gap));
00422 }
00423 else if (i-j>=end_gap)
00424 {
00425 value= (value >> max_val);
00426 if (j>=0 && j<sequence_length)
00427 value|=obs[j] << (max_val * (p_order-1-gap));
00428 }
00429 }
00430 obs[i]=value;
00431 }
00432
00433
00434 if (start>0)
00435 {
00436 for (i=start; i<sequence_length; i++)
00437 obs[i-start]=obs[i];
00438 }
00439 }
00440
00450 template <class ST>
00451 static void translate_from_single_order_reversed(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00452 {
00453 ASSERT(gap>=0);
00454
00455 const int32_t start_gap=(p_order-gap)/2;
00456 const int32_t end_gap=start_gap+gap;
00457
00458 int32_t i,j;
00459 ST value=0;
00460
00461
00462 for (i=sequence_length-1; i>=p_order-1; i--)
00463 {
00464 value=0;
00465 for (j=i; j>=i-p_order+1; j--)
00466 {
00467 if (i-j<start_gap)
00468 value= (value << max_val) | obs[j];
00469 else if (i-j>=end_gap)
00470 value= (value << max_val) | obs[j];
00471 }
00472 obs[i]= (ST) value;
00473 }
00474
00475
00476 for (i=p_order-2;i>=0;i--)
00477 {
00478 if (i>=sequence_length)
00479 continue;
00480
00481 value=0;
00482 for (j=i; j>=i-p_order+1; j--)
00483 {
00484 if (i-j<start_gap)
00485 {
00486 value= value << max_val;
00487 if (j>=0 && j<sequence_length)
00488 value|=obs[j];
00489 }
00490 else if (i-j>=end_gap)
00491 {
00492 value= value << max_val;
00493 if (j>=0 && j<sequence_length)
00494 value|=obs[j];
00495 }
00496 }
00497 obs[i]=value;
00498 }
00499
00500
00501 if (start>0)
00502 {
00503 for (i=start; i<sequence_length; i++)
00504 obs[i-start]=obs[i];
00505 }
00506 }
00507
00508
00509 protected:
00511 void init_map_table();
00512
00517 void copy_histogram(CAlphabet* src);
00518
00519
00520
00521 #ifdef HAVE_BOOST_SERIALIZATION
00522 protected:
00523
00524 friend class ::boost::serialization::access;
00525
00526 template<class Archive>
00527
00528 void serialize(Archive & ar, const unsigned int archive_version)
00529 {
00530
00531 SG_DEBUG("archiving CAlphabet\n");
00532
00533 ar & ::boost::serialization::base_object<CSGObject>(*this);
00534
00535 ar & num_bits;
00536
00537 SG_DEBUG("done with CAlphabet\n");
00538
00539 }
00540
00541
00542 #endif //HAVE_BOOST_SERIALIZATION
00543
00544
00545 public:
00547 static const uint8_t B_A;
00549 static const uint8_t B_C;
00551 static const uint8_t B_G;
00553 static const uint8_t B_T;
00555 static const uint8_t B_0;
00557 static const uint8_t MAPTABLE_UNDEF;
00559 static const char* alphabet_names[18];
00560
00561 protected:
00563 EAlphabet alphabet;
00565 int32_t num_symbols;
00567 int32_t num_bits;
00569 bool valid_chars[1 << (sizeof(uint8_t)*8)];
00571 uint8_t maptable_to_bin[1 << (sizeof(uint8_t)*8)];
00573 uint8_t maptable_to_char[1 << (sizeof(uint8_t)*8)];
00575 int64_t histogram[1 << (sizeof(uint8_t)*8)];
00576 };
00577
00578
00579 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00580 template<> inline void CAlphabet::translate_from_single_order(float32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00581 {
00582 }
00583
00584 template<> inline void CAlphabet::translate_from_single_order(float64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00585 {
00586 }
00587
00588 template<> inline void CAlphabet::translate_from_single_order(floatmax_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00589 {
00590 }
00591
00592 template<> inline void CAlphabet::translate_from_single_order_reversed(float32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00593 {
00594 }
00595
00596 template<> inline void CAlphabet::translate_from_single_order_reversed(float64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00597 {
00598 }
00599
00600 template<> inline void CAlphabet::translate_from_single_order_reversed(floatmax_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00601 {
00602 }
00603 #endif
00604
00605 }
00606
00607 #ifdef HAVE_BOOST_SERIALIZATION
00608
00609 namespace boost {
00610 namespace serialization {
00611 template<class Archive>
00612 inline void save_construct_data(Archive & ar, shogun::CAlphabet* t, const unsigned int archive_version)
00613 {
00614 std::cout << "archiving construction data CAlphabet" << std::endl;
00615 ar << t->alphabet;
00616 std::cout << "done archiving construction data CAlphabet" << std::endl;
00617 }
00618
00619 template<class Archive>
00620 inline void load_construct_data(Archive & ar, shogun::CAlphabet * t, const unsigned int archive_version)
00621 {
00622
00623 shogun::EAlphabet a;
00624 ar >> a;
00625 new(t)shogun::CAlphabet(a);
00626
00627 }
00628 }
00629 }
00630 #endif //HAVE_BOOST_SERIALIZATION
00631
00632
00633
00634 #endif