DynProg.h

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2008 Gunnar Raetsch
00008  * Written (W) 1999-2008 Soeren Sonnenburg
00009  * Written (W) 2008-2009 Jonas Behr
00010  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
00011  */
00012 
00013 #ifndef __CDYNPROG_H__
00014 #define __CDYNPROG_H__
00015 
00016 #include "lib/Mathematics.h"
00017 #include "lib/common.h"
00018 #include "base/SGObject.h"
00019 #include "lib/io.h"
00020 #include "lib/config.h"
00021 #include "structure/PlifBase.h"
00022 #include "structure/Plif.h"
00023 #include "features/StringFeatures.h"
00024 #include "distributions/Distribution.h"
00025 #include "lib/DynamicArray.h"
00026 #include "lib/Array.h"
00027 #include "lib/Array2.h"
00028 #include "lib/Array3.h"
00029 #include "lib/Time.h"
00030 
00031 #include <stdio.h>
00032 
00033 //#define DYNPROG_TIMING
00034 
00035 #ifdef USE_BIGSTATES
00036 typedef uint16_t T_STATES ;
00037 #else
00038 typedef uint8_t T_STATES ;
00039 #endif
00040 typedef T_STATES* P_STATES ;
00041 
00047 class CDynProg : public CSGObject
00048 {
00049 private:
00050 
00051     T_STATES trans_list_len;
00052     T_STATES **trans_list_forward;
00053     T_STATES *trans_list_forward_cnt;
00054     float64_t **trans_list_forward_val;
00055     int32_t **trans_list_forward_id;
00056     bool mem_initialized;
00057 
00058 #ifdef DYNPROG_TIMING
00059     CTime MyTime;
00060     CTime MyTime2;
00061     
00062     float64_t segment_init_time;
00063     float64_t segment_pos_time;
00064     float64_t segment_clean_time;
00065     float64_t segment_extend_time;
00066     float64_t orf_time;
00067     float64_t content_time;
00068     float64_t content_penalty_time;
00069     float64_t svm_init_time;
00070     float64_t svm_pos_time;
00071     float64_t svm_clean_time;
00072 #endif
00073     
00074 public:
00079     CDynProg(int32_t p_num_svms=8);
00080     virtual ~CDynProg();
00081 
00090     float64_t best_path_no_b(int32_t max_iter, int32_t & best_iter, int32_t *my_path);
00091 
00100     void best_path_no_b_trans(int32_t max_iter, int32_t & max_best_iter, int16_t nbest, float64_t *prob_nbest, int32_t *my_paths);
00101     
00102     // model related functions
00108     void set_num_states(int32_t p_N);
00109 
00111     int32_t get_num_states();
00112 
00114     int32_t get_num_svms();
00115 
00122     void init_content_svm_value_array(const int32_t p_num_svms, const int32_t seq_len);
00123 
00132     void init_tiling_data(int32_t* probe_pos, float64_t* intensities, const int32_t num_probes, const int32_t seq_len);
00133 
00142     void precompute_tiling_plifs(CPlif** PEN, const int32_t* tiling_plif_ids, const int32_t num_tiling_plifs, const int32_t seq_len, const int32_t* pos);   
00143 
00150     void resize_lin_feat(int32_t num_new_feat, int32_t seq_len);
00156     void set_p_vector(float64_t* p, int32_t N);
00157 
00163     void set_q_vector(float64_t* q, int32_t N);
00164     
00171     void set_a(float64_t* a, int32_t M, int32_t N);
00172     
00179     void set_a_id(int32_t *a, int32_t M, int32_t N);
00180     
00187     void set_a_trans_matrix(float64_t *a_trans, int32_t num_trans, int32_t N);
00188 
00189     // content svm related setup functions
00195     void init_svm_arrays(int32_t p_num_degrees, int32_t p_num_svms);
00196 
00202     void init_word_degree_array(int32_t * p_word_degree_array, int32_t num_elem);
00203 
00209     void init_cum_num_words_array(int32_t * p_cum_num_words_array, int32_t num_elem);
00210 
00216     void init_num_words_array(int32_t * p_num_words_array, int32_t num_elem);
00217 
00224     void init_mod_words_array(int32_t * p_mod_words_array, int32_t num_elem, int32_t num_columns);
00225 
00231     void init_sign_words_array(bool * p_sign_words_array, int32_t num_elem);
00232 
00238     void init_string_words_array(int32_t * p_string_words_array, int32_t num_elem);
00239 
00245     bool check_svm_arrays();
00246 
00247     // best_path_trans preparation functions
00254     void best_path_set_seq(float64_t *seq, int32_t N, int32_t seq_len);
00255 
00263     void best_path_set_seq3d(float64_t *seq, int32_t p_N, int32_t seq_len, int32_t max_num_signals);
00264 
00270     void best_path_set_pos(int32_t *pos, int32_t seq_len);
00271 
00279     void best_path_set_orf_info(int32_t *orf_info, int32_t m, int32_t n);
00280 
00288     void best_path_set_segment_sum_weights(float64_t *segment_sum_weights, int32_t num_states, int32_t seq_len);
00289 
00294     void best_path_set_plif_list(CDynamicArray<CPlifBase*>* plifs);
00295 
00302     void best_path_set_plif_id_matrix(int32_t *plif_id_matrix, int32_t m, int32_t n);
00303 
00310     void best_path_set_plif_state_signal_matrix(int32_t *plif_id_matrix, int32_t m, int32_t n);
00311 
00318     void best_path_set_genestr(char* genestr, int32_t genestr_len, int32_t genestr_num);
00319 
00320     // additional best_path_trans_deriv functions
00326     void best_path_set_my_state_seq(int32_t* my_state_seq, int32_t seq_len);
00327 
00333     void best_path_set_my_pos_seq(int32_t* my_pos_seq, int32_t seq_len);
00334 
00340     inline void best_path_set_single_genestr(char* genestr, int32_t genestr_len)
00341     {
00342         SG_DEBUG("genestrpy: %d", genestr_len);
00343         best_path_set_genestr(genestr, genestr_len, 1);
00344     }
00345 
00352     void best_path_set_dict_weights(float64_t* dictionary_weights, int32_t dict_len, int32_t n);
00353 
00360     void best_path_set_segment_loss(float64_t * segment_loss, int32_t num_segment_id1, int32_t num_segment_id2);
00361 
00368     void best_path_set_segment_ids_mask(int32_t* segment_ids, float64_t* segment_mask, int32_t m);
00369 
00370     // best_path functions
00376     void best_path_call(int32_t nbest, bool use_orf);
00377 
00379     void best_path_deriv_call();
00380 
00385     void best_path_2struct_call(int32_t nbest);
00386 
00391     void best_path_simple_call(int32_t nbest);
00392 
00397     void best_path_deriv_call(int32_t nbest);
00398     
00399     // best_path result retrieval functions
00405     void best_path_get_scores(float64_t **scores, int32_t *n);
00406 
00413     void best_path_get_states(int32_t **states, int32_t *m, int32_t *n);
00414 
00421     void best_path_get_positions(int32_t **positions, int32_t *m, int32_t *n);
00422 
00423     //best_path_trans_deriv result retrieval functions
00429     void best_path_get_losses(float64_t** my_losses, int32_t* seq_len);
00430 
00432 
00448     template <int16_t nbest, bool with_loss, bool with_multiple_sequences>
00449     void best_path_trans(const float64_t *seq, int32_t seq_len, const int32_t *pos,
00450                          const int32_t *orf_info, CPlifBase **PLif_matrix,
00451                          CPlifBase **Plif_state_signals, int32_t max_num_signals,
00452                          int32_t genestr_num,
00453                          float64_t *prob_nbest, int32_t *my_state_seq, int32_t *my_pos_seq,
00454                          bool use_orf);
00455 
00471     void best_path_trans_deriv(int32_t *my_state_seq, int32_t *my_pos_seq, float64_t *my_scores, float64_t* my_losses, int32_t my_seq_len,
00472                     const float64_t *seq_array, int32_t seq_len, const int32_t *pos, CPlifBase **Plif_matrix,
00473                     CPlifBase **Plif_state_signals, int32_t max_num_signals, int32_t genestr_num);
00474     
00491     void best_path_2struct(const float64_t *seq, int32_t seq_len, const int32_t *pos,
00492                            CPlifBase **Plif_matrix,
00493                            const char *genestr, int32_t genestr_len,
00494                            int16_t nbest,
00495                            float64_t *prob_nbest, int32_t *my_state_seq, int32_t *my_pos_seq,
00496                            float64_t *dictionary_weights, int32_t dict_len, float64_t *segment_sum_weights);
00497 
00506     void best_path_trans_simple(const float64_t *seq, int32_t seq_len, int16_t nbest,
00507                                 float64_t *prob_nbest, int32_t *my_state_seq);
00508 
00509 
00510 
00512     inline T_STATES get_N() const
00513       {
00514         return N ;
00515       }
00516     
00521     inline void set_q(T_STATES offset, float64_t value)
00522     {
00523         end_state_distribution_q[offset]=value;
00524     }
00525 
00530     inline void set_p(T_STATES offset, float64_t value)
00531     {
00532         initial_state_distribution_p[offset]=value;
00533     }
00534 
00541     inline void set_a(T_STATES line_, T_STATES column, float64_t value)
00542     {
00543       transition_matrix_a.element(line_,column)=value; // look also best_path!
00544     }
00545 
00551     inline float64_t get_q(T_STATES offset) const
00552     {
00553         return end_state_distribution_q[offset];
00554     }
00555 
00561     inline float64_t get_q_deriv(T_STATES offset) const
00562     {
00563         return end_state_distribution_q_deriv[offset];
00564     }
00565 
00571     inline float64_t get_p(T_STATES offset) const
00572     {
00573         return initial_state_distribution_p[offset];
00574     }
00575 
00581     inline float64_t get_p_deriv(T_STATES offset) const
00582     {
00583         return initial_state_distribution_p_deriv[offset];
00584     }
00585     
00596     void precompute_content_values(uint16_t*** wordstr, const int32_t *pos,
00597         const int32_t num_cand_pos, const int32_t genestr_len,
00598         float64_t *dictionary_weights, int32_t dict_len);
00599 
00600 
00607     inline float64_t* get_lin_feat(int32_t & dim1, int32_t & dim2) 
00608     {
00609         m_lin_feat.get_array_size(dim1, dim2);
00610         return m_lin_feat.get_array();
00611     }
00618     inline void set_lin_feat(float64_t* p_lin_feat, int32_t p_num_svms, int32_t p_seq_len) 
00619     {
00620         m_lin_feat.set_array(p_lin_feat, p_num_svms, p_seq_len, true);
00621     }
00630     void create_word_string(const char* genestr, int32_t genestr_num, int32_t genestr_len, uint16_t*** wordstr);
00631 
00637     void precompute_stop_codons(const char* genestr, int32_t genestr_len);
00638 
00644     void set_genestr_len(int32_t genestr_len);
00645 
00652     inline float64_t get_a(T_STATES line_, T_STATES column) const
00653     {
00654       return transition_matrix_a.element(line_,column); // look also best_path()!
00655     }
00656 
00663     inline float64_t get_a_deriv(T_STATES line_, T_STATES column) const
00664     {
00665       return transition_matrix_a_deriv.element(line_,column); // look also best_path()!
00666     }
00668 protected:
00669 
00670     /* helper functions */
00671 
00681     inline void lookup_content_svm_values(const int32_t from_state,
00682         const int32_t to_state, const int32_t from_pos, const int32_t to_pos,
00683         float64_t* svm_values, int32_t frame);
00684 
00692     inline void lookup_tiling_plif_values(const int32_t from_state,
00693         const int32_t to_state, const int32_t len, float64_t* svm_values);
00694 
00699     inline int32_t find_frame(const int32_t from_state);
00700 
00709     inline int32_t raw_intensities_interval_query(
00710         const int32_t from_pos, const int32_t to_pos, float64_t* intensities, int32_t type);
00711 
00720     void translate_from_single_order(uint16_t* obs, int32_t sequence_length, int32_t start,
00721         int32_t order, int32_t max_val=2);
00722 
00729     void reset_svm_value(int32_t pos, int32_t & last_svm_pos, float64_t * svm_value);
00730 
00738     void extend_svm_value(uint16_t* wordstr, int32_t pos, int32_t &last_svm_pos,
00739         float64_t* svm_value);
00740 
00748     void reset_segment_sum_value(int32_t num_states, int32_t pos,
00749         int32_t & last_segment_sum_pos, float64_t * segment_sum_value);
00750 
00760     void extend_segment_sum_value(float64_t *segment_sum_weights, int32_t seqlen,
00761         int32_t num_states, int32_t pos, int32_t &last_segment_sum_pos,
00762         float64_t* segment_sum_value);
00763 
00765     struct svm_values_struct
00766     {
00768         int32_t maxlookback;
00770         int32_t seqlen;
00771 
00773         int32_t* start_pos;
00775         float64_t ** svm_values_unnormalized;
00777         float64_t * svm_values;
00779         bool *** word_used;
00781         int32_t **num_unique_words;
00782     };
00783 
00784     //void reset_svm_values(int32_t pos, int32_t * last_svm_pos, float64_t * svm_value) ;
00785     //void extend_svm_values(uint16_t** wordstr, int32_t pos, int32_t *last_svm_pos, float64_t* svm_value) ;
00793     void init_svm_values(struct svm_values_struct & svs, int32_t start_pos,
00794         int32_t seqlen, int32_t howmuchlookback);
00795 
00800     void clear_svm_values(struct svm_values_struct & svs);
00801 
00809     void find_svm_values_till_pos(uint16_t*** wordstr, const int32_t *pos, int32_t t_end,
00810         struct svm_values_struct &svs);
00811 
00819     void find_svm_values_till_pos(uint16_t** wordstr, const int32_t *pos, int32_t t_end,
00820         struct svm_values_struct &svs);
00821 
00830     void update_svm_values_till_pos(uint16_t*** wordstr, const int32_t *pos, int32_t t_end,
00831         int32_t prev_t_end, struct svm_values_struct &svs);
00832 
00841     bool extend_orf(int32_t orf_from, int32_t orf_to, int32_t start, int32_t &last_pos, int32_t to);
00842 
00844     struct segment_loss_struct
00845     {
00847         int32_t maxlookback;
00849         int32_t seqlen;
00851         int32_t *segments_changed;
00853         float64_t *num_segment_id;
00855         int32_t *length_segment_id ;
00856     };
00857 
00864     void init_segment_loss(struct segment_loss_struct & loss, int32_t seqlen,
00865         int32_t howmuchlookback);
00866 
00871     void clear_segment_loss(struct segment_loss_struct & loss);
00872 
00883     float64_t extend_segment_loss(struct segment_loss_struct & loss,
00884         const int32_t * pos_array, int32_t segment_id, int32_t pos, int32_t& last_pos,
00885         float64_t &last_value);
00886 
00895     void find_segment_loss_till_pos(const int32_t * pos, int32_t t_end,
00896         CArray<int32_t>& segment_ids, CArray<float64_t>& segment_mask,
00897         struct segment_loss_struct& loss);
00898 
00900         inline virtual const char* get_name() const { return "DynProg"; }
00901     
00902 protected:
00907 
00908     int32_t N;
00909 
00911     CArray2<int32_t> transition_matrix_a_id;
00912     CArray2<float64_t> transition_matrix_a;
00913     CArray2<float64_t> transition_matrix_a_deriv;
00914 
00916     CArray<float64_t> initial_state_distribution_p;
00917     CArray<float64_t> initial_state_distribution_p_deriv;
00918 
00920     CArray<float64_t> end_state_distribution_q;
00921     CArray<float64_t> end_state_distribution_q_deriv;
00922 
00924     
00926     CArray2<float64_t> dict_weights;
00928     float64_t * dict_weights_array;
00929 
00931     int32_t num_degrees;
00933     int32_t num_svms;
00935     int32_t num_strings;
00936     
00938     CArray<int32_t> word_degree;
00940     CArray<int32_t> cum_num_words;
00942     int32_t * cum_num_words_array;
00944     CArray<int32_t> num_words;
00946     int32_t * num_words_array;
00948     CArray2<int32_t> mod_words;
00950     int32_t * mod_words_array;
00952     CArray<bool> sign_words;
00954     bool * sign_words_array;
00956     CArray<int32_t> string_words;
00958     int32_t * string_words_array;
00959 
00960 //  CArray3<int32_t> word_used ;
00961 //  int32_t *word_used_array ;
00962 //  CArray2<float64_t> svm_values_unnormalized ;
00964     CArray<int32_t> svm_pos_start;
00966     CArray<int32_t> num_unique_words;
00968     bool svm_arrays_clean;
00969 
00971     int32_t num_svms_single;
00973     int32_t word_degree_single;
00975     int32_t cum_num_words_single;
00977     int32_t num_words_single;
00978 
00980     CArray<bool> word_used_single;
00982     CArray<float64_t> svm_value_unnormalized_single;
00984     int32_t num_unique_words_single;
00985 
00987     int32_t max_a_id;
00988     
00989     // control info
00991     int32_t m_step;
00993     int32_t m_call;
00994 
00995     // input arguments
00997     CArray3<float64_t> m_seq;
00999     CArray<int32_t> m_pos;
01001     CArray2<int32_t> m_orf_info;
01003     CArray2<float64_t> m_segment_sum_weights;
01005     CArray<CPlifBase*> m_plif_list;
01007     CArray2<CPlifBase*> m_PEN;
01009     CArray2<CPlifBase*> m_PEN_state_signals;
01011     CArray2<char> m_genestr;
01013     CArray2<float64_t> m_dict_weights;
01015     CArray3<float64_t> m_segment_loss;
01017     CArray<int32_t> m_segment_ids;
01019     CArray<float64_t> m_segment_mask;
01021     CArray<int32_t> m_my_state_seq;
01023     CArray<int32_t> m_my_pos_seq;
01025     CArray<float64_t> m_my_scores;
01027     CArray<float64_t> m_my_losses;
01028 
01029     // output arguments
01031     CArray<float64_t> m_scores;
01033     CArray2<int32_t> m_states;
01035     CArray2<int32_t> m_positions;
01036 
01040     CArray<bool> m_genestr_stop;
01041 
01046     CArray2<float64_t> m_lin_feat;
01048     //int32_t m_num_lin_feat;
01049 
01050 
01052     float64_t *m_raw_intensities;
01054     int32_t* m_probe_pos;
01056     int32_t* m_num_probes_cum;
01058     int32_t* m_num_lin_feat_plifs_cum;
01060     int32_t m_num_raw_data;
01062     int32_t m_genestr_len;
01063 };
01064 
01065 inline int32_t CDynProg::raw_intensities_interval_query(const int32_t from_pos, const int32_t to_pos, float64_t* intensities, int32_t type)
01066 {
01067     ASSERT(from_pos<to_pos);
01068     int32_t num_intensities = 0;
01069     int32_t* p_tiling_pos  = &m_probe_pos[m_num_probes_cum[type-1]];
01070     float64_t* p_tiling_data = &m_raw_intensities[m_num_probes_cum[type-1]];
01071     int32_t last_pos;
01072     int32_t num = m_num_probes_cum[type-1];
01073     while (*p_tiling_pos<to_pos)
01074     {
01075         if (*p_tiling_pos>=from_pos)
01076         {
01077             intensities[num_intensities] = *p_tiling_data;
01078             num_intensities++;
01079         }
01080         num++;
01081         if (num>=m_num_probes_cum[type])
01082             break;
01083         last_pos = *p_tiling_pos;
01084         p_tiling_pos++;
01085         p_tiling_data++;
01086         ASSERT(last_pos<*p_tiling_pos);
01087     }
01088     return num_intensities;
01089 }
01090 inline void CDynProg::lookup_content_svm_values(const int32_t from_state, const int32_t to_state, const int32_t from_pos, const int32_t to_pos, float64_t* svm_values, int32_t frame)
01091 {
01092 //  ASSERT(from_state<to_state);
01093 //  if (!(from_pos<to_pos))
01094 //      SG_ERROR("from_pos!<to_pos, from_pos: %i to_pos: %i \n",from_pos,to_pos);
01095     for (int32_t i=0;i<num_svms;i++)
01096     {
01097         float64_t to_val   = m_lin_feat.get_element(i,  to_state);
01098         float64_t from_val = m_lin_feat.get_element(i,from_state);
01099         svm_values[i]=(to_val-from_val)/(to_pos-from_pos);
01100     }
01101     for (int32_t i=num_svms;i<m_num_lin_feat_plifs_cum[m_num_raw_data];i++)
01102     {
01103         float64_t to_val   = m_lin_feat.get_element(i,  to_state);
01104         float64_t from_val = m_lin_feat.get_element(i,from_state);
01105         svm_values[i]=to_val-from_val;
01106     }
01107     // find the correct row with precomputed 
01108     if (frame!=-1)
01109     {
01110         svm_values[4] = 1e10;
01111         svm_values[5] = 1e10;
01112         svm_values[6] = 1e10;
01113         int32_t global_frame = from_pos%3;
01114             int32_t row = ((global_frame+frame)%3)+4;
01115         float64_t to_val   = m_lin_feat.get_element(row,  to_state);
01116         float64_t from_val = m_lin_feat.get_element(row,from_state);
01117         svm_values[frame+4] = (to_val-from_val)/(to_pos-from_pos);
01118     }
01119 }
01120 #endif

SHOGUN Machine Learning Toolbox - Documentation