00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013 #ifndef __CDYNPROG_H__
00014 #define __CDYNPROG_H__
00015
00016 #include "lib/Mathematics.h"
00017 #include "lib/common.h"
00018 #include "base/SGObject.h"
00019 #include "lib/io.h"
00020 #include "lib/config.h"
00021 #include "structure/PlifBase.h"
00022 #include "structure/Plif.h"
00023 #include "features/StringFeatures.h"
00024 #include "distributions/Distribution.h"
00025 #include "lib/DynamicArray.h"
00026 #include "lib/Array.h"
00027 #include "lib/Array2.h"
00028 #include "lib/Array3.h"
00029 #include "lib/Time.h"
00030
00031 #include <stdio.h>
00032
00033
00034
00035 #ifdef USE_BIGSTATES
00036 typedef uint16_t T_STATES ;
00037 #else
00038 typedef uint8_t T_STATES ;
00039 #endif
00040 typedef T_STATES* P_STATES ;
00041
00047 class CDynProg : public CSGObject
00048 {
00049 private:
00050
00051 T_STATES trans_list_len;
00052 T_STATES **trans_list_forward;
00053 T_STATES *trans_list_forward_cnt;
00054 float64_t **trans_list_forward_val;
00055 int32_t **trans_list_forward_id;
00056 bool mem_initialized;
00057
00058 #ifdef DYNPROG_TIMING
00059 CTime MyTime;
00060 CTime MyTime2;
00061
00062 float64_t segment_init_time;
00063 float64_t segment_pos_time;
00064 float64_t segment_clean_time;
00065 float64_t segment_extend_time;
00066 float64_t orf_time;
00067 float64_t content_time;
00068 float64_t content_penalty_time;
00069 float64_t svm_init_time;
00070 float64_t svm_pos_time;
00071 float64_t svm_clean_time;
00072 #endif
00073
00074 public:
00079 CDynProg(int32_t p_num_svms=8);
00080 virtual ~CDynProg();
00081
00090 float64_t best_path_no_b(int32_t max_iter, int32_t & best_iter, int32_t *my_path);
00091
00100 void best_path_no_b_trans(int32_t max_iter, int32_t & max_best_iter, int16_t nbest, float64_t *prob_nbest, int32_t *my_paths);
00101
00102
00108 void set_num_states(int32_t p_N);
00109
00111 int32_t get_num_states();
00112
00114 int32_t get_num_svms();
00115
00122 void init_content_svm_value_array(const int32_t p_num_svms, const int32_t seq_len);
00123
00132 void init_tiling_data(int32_t* probe_pos, float64_t* intensities, const int32_t num_probes, const int32_t seq_len);
00133
00142 void precompute_tiling_plifs(CPlif** PEN, const int32_t* tiling_plif_ids, const int32_t num_tiling_plifs, const int32_t seq_len, const int32_t* pos);
00143
00150 void resize_lin_feat(int32_t num_new_feat, int32_t seq_len);
00156 void set_p_vector(float64_t* p, int32_t N);
00157
00163 void set_q_vector(float64_t* q, int32_t N);
00164
00171 void set_a(float64_t* a, int32_t M, int32_t N);
00172
00179 void set_a_id(int32_t *a, int32_t M, int32_t N);
00180
00187 void set_a_trans_matrix(float64_t *a_trans, int32_t num_trans, int32_t N);
00188
00189
00195 void init_svm_arrays(int32_t p_num_degrees, int32_t p_num_svms);
00196
00202 void init_word_degree_array(int32_t * p_word_degree_array, int32_t num_elem);
00203
00209 void init_cum_num_words_array(int32_t * p_cum_num_words_array, int32_t num_elem);
00210
00216 void init_num_words_array(int32_t * p_num_words_array, int32_t num_elem);
00217
00224 void init_mod_words_array(int32_t * p_mod_words_array, int32_t num_elem, int32_t num_columns);
00225
00231 void init_sign_words_array(bool * p_sign_words_array, int32_t num_elem);
00232
00238 void init_string_words_array(int32_t * p_string_words_array, int32_t num_elem);
00239
00245 bool check_svm_arrays();
00246
00247
00254 void best_path_set_seq(float64_t *seq, int32_t N, int32_t seq_len);
00255
00263 void best_path_set_seq3d(float64_t *seq, int32_t p_N, int32_t seq_len, int32_t max_num_signals);
00264
00270 void best_path_set_pos(int32_t *pos, int32_t seq_len);
00271
00279 void best_path_set_orf_info(int32_t *orf_info, int32_t m, int32_t n);
00280
00288 void best_path_set_segment_sum_weights(float64_t *segment_sum_weights, int32_t num_states, int32_t seq_len);
00289
00294 void best_path_set_plif_list(CDynamicArray<CPlifBase*>* plifs);
00295
00302 void best_path_set_plif_id_matrix(int32_t *plif_id_matrix, int32_t m, int32_t n);
00303
00310 void best_path_set_plif_state_signal_matrix(int32_t *plif_id_matrix, int32_t m, int32_t n);
00311
00318 void best_path_set_genestr(char* genestr, int32_t genestr_len, int32_t genestr_num);
00319
00320
00326 void best_path_set_my_state_seq(int32_t* my_state_seq, int32_t seq_len);
00327
00333 void best_path_set_my_pos_seq(int32_t* my_pos_seq, int32_t seq_len);
00334
00340 inline void best_path_set_single_genestr(char* genestr, int32_t genestr_len)
00341 {
00342 SG_DEBUG("genestrpy: %d", genestr_len);
00343 best_path_set_genestr(genestr, genestr_len, 1);
00344 }
00345
00352 void best_path_set_dict_weights(float64_t* dictionary_weights, int32_t dict_len, int32_t n);
00353
00360 void best_path_set_segment_loss(float64_t * segment_loss, int32_t num_segment_id1, int32_t num_segment_id2);
00361
00368 void best_path_set_segment_ids_mask(int32_t* segment_ids, float64_t* segment_mask, int32_t m);
00369
00370
00376 void best_path_call(int32_t nbest, bool use_orf);
00377
00379 void best_path_deriv_call();
00380
00385 void best_path_2struct_call(int32_t nbest);
00386
00391 void best_path_simple_call(int32_t nbest);
00392
00397 void best_path_deriv_call(int32_t nbest);
00398
00399
00405 void best_path_get_scores(float64_t **scores, int32_t *n);
00406
00413 void best_path_get_states(int32_t **states, int32_t *m, int32_t *n);
00414
00421 void best_path_get_positions(int32_t **positions, int32_t *m, int32_t *n);
00422
00423
00429 void best_path_get_losses(float64_t** my_losses, int32_t* seq_len);
00430
00432
00448 template <int16_t nbest, bool with_loss, bool with_multiple_sequences>
00449 void best_path_trans(const float64_t *seq, int32_t seq_len, const int32_t *pos,
00450 const int32_t *orf_info, CPlifBase **PLif_matrix,
00451 CPlifBase **Plif_state_signals, int32_t max_num_signals,
00452 int32_t genestr_num,
00453 float64_t *prob_nbest, int32_t *my_state_seq, int32_t *my_pos_seq,
00454 bool use_orf);
00455
00471 void best_path_trans_deriv(int32_t *my_state_seq, int32_t *my_pos_seq, float64_t *my_scores, float64_t* my_losses, int32_t my_seq_len,
00472 const float64_t *seq_array, int32_t seq_len, const int32_t *pos, CPlifBase **Plif_matrix,
00473 CPlifBase **Plif_state_signals, int32_t max_num_signals, int32_t genestr_num);
00474
00491 void best_path_2struct(const float64_t *seq, int32_t seq_len, const int32_t *pos,
00492 CPlifBase **Plif_matrix,
00493 const char *genestr, int32_t genestr_len,
00494 int16_t nbest,
00495 float64_t *prob_nbest, int32_t *my_state_seq, int32_t *my_pos_seq,
00496 float64_t *dictionary_weights, int32_t dict_len, float64_t *segment_sum_weights);
00497
00506 void best_path_trans_simple(const float64_t *seq, int32_t seq_len, int16_t nbest,
00507 float64_t *prob_nbest, int32_t *my_state_seq);
00508
00509
00510
00512 inline T_STATES get_N() const
00513 {
00514 return N ;
00515 }
00516
00521 inline void set_q(T_STATES offset, float64_t value)
00522 {
00523 end_state_distribution_q[offset]=value;
00524 }
00525
00530 inline void set_p(T_STATES offset, float64_t value)
00531 {
00532 initial_state_distribution_p[offset]=value;
00533 }
00534
00541 inline void set_a(T_STATES line_, T_STATES column, float64_t value)
00542 {
00543 transition_matrix_a.element(line_,column)=value;
00544 }
00545
00551 inline float64_t get_q(T_STATES offset) const
00552 {
00553 return end_state_distribution_q[offset];
00554 }
00555
00561 inline float64_t get_q_deriv(T_STATES offset) const
00562 {
00563 return end_state_distribution_q_deriv[offset];
00564 }
00565
00571 inline float64_t get_p(T_STATES offset) const
00572 {
00573 return initial_state_distribution_p[offset];
00574 }
00575
00581 inline float64_t get_p_deriv(T_STATES offset) const
00582 {
00583 return initial_state_distribution_p_deriv[offset];
00584 }
00585
00596 void precompute_content_values(uint16_t*** wordstr, const int32_t *pos,
00597 const int32_t num_cand_pos, const int32_t genestr_len,
00598 float64_t *dictionary_weights, int32_t dict_len);
00599
00600
00607 inline float64_t* get_lin_feat(int32_t & dim1, int32_t & dim2)
00608 {
00609 m_lin_feat.get_array_size(dim1, dim2);
00610 return m_lin_feat.get_array();
00611 }
00618 inline void set_lin_feat(float64_t* p_lin_feat, int32_t p_num_svms, int32_t p_seq_len)
00619 {
00620 m_lin_feat.set_array(p_lin_feat, p_num_svms, p_seq_len, true);
00621 }
00630 void create_word_string(const char* genestr, int32_t genestr_num, int32_t genestr_len, uint16_t*** wordstr);
00631
00637 void precompute_stop_codons(const char* genestr, int32_t genestr_len);
00638
00644 void set_genestr_len(int32_t genestr_len);
00645
00652 inline float64_t get_a(T_STATES line_, T_STATES column) const
00653 {
00654 return transition_matrix_a.element(line_,column);
00655 }
00656
00663 inline float64_t get_a_deriv(T_STATES line_, T_STATES column) const
00664 {
00665 return transition_matrix_a_deriv.element(line_,column);
00666 }
00668 protected:
00669
00670
00671
00681 inline void lookup_content_svm_values(const int32_t from_state,
00682 const int32_t to_state, const int32_t from_pos, const int32_t to_pos,
00683 float64_t* svm_values, int32_t frame);
00684
00692 inline void lookup_tiling_plif_values(const int32_t from_state,
00693 const int32_t to_state, const int32_t len, float64_t* svm_values);
00694
00699 inline int32_t find_frame(const int32_t from_state);
00700
00709 inline int32_t raw_intensities_interval_query(
00710 const int32_t from_pos, const int32_t to_pos, float64_t* intensities, int32_t type);
00711
00720 void translate_from_single_order(uint16_t* obs, int32_t sequence_length, int32_t start,
00721 int32_t order, int32_t max_val=2);
00722
00729 void reset_svm_value(int32_t pos, int32_t & last_svm_pos, float64_t * svm_value);
00730
00738 void extend_svm_value(uint16_t* wordstr, int32_t pos, int32_t &last_svm_pos,
00739 float64_t* svm_value);
00740
00748 void reset_segment_sum_value(int32_t num_states, int32_t pos,
00749 int32_t & last_segment_sum_pos, float64_t * segment_sum_value);
00750
00760 void extend_segment_sum_value(float64_t *segment_sum_weights, int32_t seqlen,
00761 int32_t num_states, int32_t pos, int32_t &last_segment_sum_pos,
00762 float64_t* segment_sum_value);
00763
00765 struct svm_values_struct
00766 {
00768 int32_t maxlookback;
00770 int32_t seqlen;
00771
00773 int32_t* start_pos;
00775 float64_t ** svm_values_unnormalized;
00777 float64_t * svm_values;
00779 bool *** word_used;
00781 int32_t **num_unique_words;
00782 };
00783
00784
00785
00793 void init_svm_values(struct svm_values_struct & svs, int32_t start_pos,
00794 int32_t seqlen, int32_t howmuchlookback);
00795
00800 void clear_svm_values(struct svm_values_struct & svs);
00801
00809 void find_svm_values_till_pos(uint16_t*** wordstr, const int32_t *pos, int32_t t_end,
00810 struct svm_values_struct &svs);
00811
00819 void find_svm_values_till_pos(uint16_t** wordstr, const int32_t *pos, int32_t t_end,
00820 struct svm_values_struct &svs);
00821
00830 void update_svm_values_till_pos(uint16_t*** wordstr, const int32_t *pos, int32_t t_end,
00831 int32_t prev_t_end, struct svm_values_struct &svs);
00832
00841 bool extend_orf(int32_t orf_from, int32_t orf_to, int32_t start, int32_t &last_pos, int32_t to);
00842
00844 struct segment_loss_struct
00845 {
00847 int32_t maxlookback;
00849 int32_t seqlen;
00851 int32_t *segments_changed;
00853 float64_t *num_segment_id;
00855 int32_t *length_segment_id ;
00856 };
00857
00864 void init_segment_loss(struct segment_loss_struct & loss, int32_t seqlen,
00865 int32_t howmuchlookback);
00866
00871 void clear_segment_loss(struct segment_loss_struct & loss);
00872
00883 float64_t extend_segment_loss(struct segment_loss_struct & loss,
00884 const int32_t * pos_array, int32_t segment_id, int32_t pos, int32_t& last_pos,
00885 float64_t &last_value);
00886
00895 void find_segment_loss_till_pos(const int32_t * pos, int32_t t_end,
00896 CArray<int32_t>& segment_ids, CArray<float64_t>& segment_mask,
00897 struct segment_loss_struct& loss);
00898
00900 inline virtual const char* get_name() const { return "DynProg"; }
00901
00902 protected:
00907
00908 int32_t N;
00909
00911 CArray2<int32_t> transition_matrix_a_id;
00912 CArray2<float64_t> transition_matrix_a;
00913 CArray2<float64_t> transition_matrix_a_deriv;
00914
00916 CArray<float64_t> initial_state_distribution_p;
00917 CArray<float64_t> initial_state_distribution_p_deriv;
00918
00920 CArray<float64_t> end_state_distribution_q;
00921 CArray<float64_t> end_state_distribution_q_deriv;
00922
00924
00926 CArray2<float64_t> dict_weights;
00928 float64_t * dict_weights_array;
00929
00931 int32_t num_degrees;
00933 int32_t num_svms;
00935 int32_t num_strings;
00936
00938 CArray<int32_t> word_degree;
00940 CArray<int32_t> cum_num_words;
00942 int32_t * cum_num_words_array;
00944 CArray<int32_t> num_words;
00946 int32_t * num_words_array;
00948 CArray2<int32_t> mod_words;
00950 int32_t * mod_words_array;
00952 CArray<bool> sign_words;
00954 bool * sign_words_array;
00956 CArray<int32_t> string_words;
00958 int32_t * string_words_array;
00959
00960
00961
00962
00964 CArray<int32_t> svm_pos_start;
00966 CArray<int32_t> num_unique_words;
00968 bool svm_arrays_clean;
00969
00971 int32_t num_svms_single;
00973 int32_t word_degree_single;
00975 int32_t cum_num_words_single;
00977 int32_t num_words_single;
00978
00980 CArray<bool> word_used_single;
00982 CArray<float64_t> svm_value_unnormalized_single;
00984 int32_t num_unique_words_single;
00985
00987 int32_t max_a_id;
00988
00989
00991 int32_t m_step;
00993 int32_t m_call;
00994
00995
00997 CArray3<float64_t> m_seq;
00999 CArray<int32_t> m_pos;
01001 CArray2<int32_t> m_orf_info;
01003 CArray2<float64_t> m_segment_sum_weights;
01005 CArray<CPlifBase*> m_plif_list;
01007 CArray2<CPlifBase*> m_PEN;
01009 CArray2<CPlifBase*> m_PEN_state_signals;
01011 CArray2<char> m_genestr;
01013 CArray2<float64_t> m_dict_weights;
01015 CArray3<float64_t> m_segment_loss;
01017 CArray<int32_t> m_segment_ids;
01019 CArray<float64_t> m_segment_mask;
01021 CArray<int32_t> m_my_state_seq;
01023 CArray<int32_t> m_my_pos_seq;
01025 CArray<float64_t> m_my_scores;
01027 CArray<float64_t> m_my_losses;
01028
01029
01031 CArray<float64_t> m_scores;
01033 CArray2<int32_t> m_states;
01035 CArray2<int32_t> m_positions;
01036
01040 CArray<bool> m_genestr_stop;
01041
01046 CArray2<float64_t> m_lin_feat;
01048
01049
01050
01052 float64_t *m_raw_intensities;
01054 int32_t* m_probe_pos;
01056 int32_t* m_num_probes_cum;
01058 int32_t* m_num_lin_feat_plifs_cum;
01060 int32_t m_num_raw_data;
01062 int32_t m_genestr_len;
01063 };
01064
01065 inline int32_t CDynProg::raw_intensities_interval_query(const int32_t from_pos, const int32_t to_pos, float64_t* intensities, int32_t type)
01066 {
01067 ASSERT(from_pos<to_pos);
01068 int32_t num_intensities = 0;
01069 int32_t* p_tiling_pos = &m_probe_pos[m_num_probes_cum[type-1]];
01070 float64_t* p_tiling_data = &m_raw_intensities[m_num_probes_cum[type-1]];
01071 int32_t last_pos;
01072 int32_t num = m_num_probes_cum[type-1];
01073 while (*p_tiling_pos<to_pos)
01074 {
01075 if (*p_tiling_pos>=from_pos)
01076 {
01077 intensities[num_intensities] = *p_tiling_data;
01078 num_intensities++;
01079 }
01080 num++;
01081 if (num>=m_num_probes_cum[type])
01082 break;
01083 last_pos = *p_tiling_pos;
01084 p_tiling_pos++;
01085 p_tiling_data++;
01086 ASSERT(last_pos<*p_tiling_pos);
01087 }
01088 return num_intensities;
01089 }
01090 inline void CDynProg::lookup_content_svm_values(const int32_t from_state, const int32_t to_state, const int32_t from_pos, const int32_t to_pos, float64_t* svm_values, int32_t frame)
01091 {
01092
01093
01094
01095 for (int32_t i=0;i<num_svms;i++)
01096 {
01097 float64_t to_val = m_lin_feat.get_element(i, to_state);
01098 float64_t from_val = m_lin_feat.get_element(i,from_state);
01099 svm_values[i]=(to_val-from_val)/(to_pos-from_pos);
01100 }
01101 for (int32_t i=num_svms;i<m_num_lin_feat_plifs_cum[m_num_raw_data];i++)
01102 {
01103 float64_t to_val = m_lin_feat.get_element(i, to_state);
01104 float64_t from_val = m_lin_feat.get_element(i,from_state);
01105 svm_values[i]=to_val-from_val;
01106 }
01107
01108 if (frame!=-1)
01109 {
01110 svm_values[4] = 1e10;
01111 svm_values[5] = 1e10;
01112 svm_values[6] = 1e10;
01113 int32_t global_frame = from_pos%3;
01114 int32_t row = ((global_frame+frame)%3)+4;
01115 float64_t to_val = m_lin_feat.get_element(row, to_state);
01116 float64_t from_val = m_lin_feat.get_element(row,from_state);
01117 svm_values[frame+4] = (to_val-from_val)/(to_pos-from_pos);
01118 }
01119 }
01120 #endif