00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #ifndef _TRIE_H___
00013 #define _TRIE_H___
00014
00015 #include <string.h>
00016 #include "lib/common.h"
00017 #include "lib/io.h"
00018 #include "lib/DynamicArray.h"
00019 #include "lib/Mathematics.h"
00020 #include "base/SGObject.h"
00021
00022 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00023
00024
00025 #define NO_CHILD ((int32_t)-1073741824)
00026
00027 #define WEIGHTS_IN_TRIE
00028
00029
00030 #ifdef TRIE_CHECK_EVERYTHING
00031 #define TRIE_ASSERT_EVERYTHING(x) ASSERT(x)
00032 #else
00033 #define TRIE_ASSERT_EVERYTHING(x)
00034 #endif
00035
00036
00037 #define TRIE_ASSERT(x)
00038
00039 #define TRIE_TERMINAL_CHARACTER 7
00040
00042 struct ConsensusEntry
00043 {
00045 uint64_t string;
00047 float32_t score;
00049 int32_t bt;
00050 };
00051
00053 struct POIMTrie
00054 {
00056 float64_t weight;
00057 #ifdef TRIE_CHECK_EVERYTHING
00058
00059 bool has_seq;
00061 bool has_floats;
00062 #endif
00063 union
00064 {
00066 float32_t child_weights[4];
00068 int32_t children[4];
00070 uint8_t seq[16] ;
00071 };
00072
00074 float64_t S;
00076 float64_t L;
00078 float64_t R;
00079 };
00080
00082 struct DNATrie
00083 {
00085 float64_t weight;
00086 #ifdef TRIE_CHECK_EVERYTHING
00087
00088 bool has_seq;
00090 bool has_floats;
00091 #endif
00092 union
00093 {
00095 float32_t child_weights[4];
00097 int32_t children[4];
00099 uint8_t seq[16] ;
00100 };
00101 };
00102
00104 struct TreeParseInfo {
00106 int32_t num_sym;
00108 int32_t num_feat;
00110 int32_t p;
00112 int32_t k;
00114 int32_t* nofsKmers;
00116 float64_t* margFactors;
00118 int32_t* x;
00120 int32_t* substrs;
00122 int32_t y0;
00124 float64_t* C_k;
00126 float64_t* L_k;
00128 float64_t* R_k;
00129 };
00130
00131 #endif // DOXYGEN_SHOULD_SKIP_THIS
00132
00133 template <class Trie> class CTrie;
00134
00153 template <class Trie> class CTrie : public CSGObject
00154 {
00155 public:
00162 CTrie(int32_t d, bool p_use_compact_terminal_nodes=true);
00163
00165 CTrie(const CTrie & to_copy);
00166 virtual ~CTrie();
00167
00169 const CTrie & operator=(const CTrie & to_copy);
00170
00178 bool compare_traverse(
00179 int32_t node, const CTrie & other, int32_t other_node);
00180
00186 bool compare(const CTrie & other);
00187
00194 bool find_node(int32_t node, int32_t * trace, int32_t &trace_len) const;
00195
00202 int32_t find_deepest_node(
00203 int32_t start_node, int32_t &deepest_node) const;
00204
00209 void display_node(int32_t node) const;
00210
00212 void destroy();
00213
00218 void set_degree(int32_t d);
00219
00226 void create(int32_t len, bool p_use_compact_terminal_nodes=true);
00227
00233 void delete_trees(bool p_use_compact_terminal_nodes=true);
00234
00245 void add_to_trie(
00246 int32_t i, int32_t seq_offset, int32_t* vec, float32_t alpha,
00247 float64_t *weights, bool degree_times_position_weights);
00248
00255 float64_t compute_abs_weights_tree(int32_t tree, int32_t depth);
00256
00262 float64_t* compute_abs_weights(int32_t &len);
00263
00276 float64_t compute_by_tree_helper(
00277 int32_t* vec, int32_t len, int32_t seq_pos, int32_t tree_pos,
00278 int32_t weight_pos, float64_t * weights,
00279 bool degree_times_position_weights) ;
00280
00295 void compute_by_tree_helper(
00296 int32_t* vec, int32_t len, int32_t seq_pos, int32_t tree_pos,
00297 int32_t weight_pos, float64_t* LevelContrib, float64_t factor,
00298 int32_t mkl_stepsize, float64_t * weights,
00299 bool degree_times_position_weights);
00300
00315 void compute_scoring_helper(
00316 int32_t tree, int32_t i, int32_t j, float64_t weight, int32_t d,
00317 int32_t max_degree, int32_t num_feat, int32_t num_sym,
00318 int32_t sym_offset, int32_t offs, float64_t* result);
00319
00332 void add_example_to_tree_mismatch_recursion(
00333 int32_t tree, int32_t i, float64_t alpha, int32_t *vec,
00334 int32_t len_rem, int32_t degree_rec, int32_t mismatch_rec,
00335 int32_t max_mismatch, float64_t * weights);
00336
00346 void traverse(
00347 int32_t tree, const int32_t p, struct TreeParseInfo info,
00348 const int32_t depth, int32_t* const x, const int32_t k);
00349
00359 void count(
00360 const float64_t w, const int32_t depth,
00361 const struct TreeParseInfo info, const int32_t p, int32_t* x,
00362 const int32_t k);
00363
00370 int32_t compact_nodes(int32_t start_node, int32_t depth, float64_t * weights);
00371
00380 float64_t get_cumulative_score(
00381 int32_t pos, uint64_t seq, int32_t deg, float64_t* weights);
00382
00392 void fill_backtracking_table_recursion(
00393 Trie* tree, int32_t depth, uint64_t seq, float64_t value,
00394 CDynamicArray<ConsensusEntry>* table, float64_t* weights);
00395
00404 void fill_backtracking_table(
00405 int32_t pos, CDynamicArray<ConsensusEntry>* prev,
00406 CDynamicArray<ConsensusEntry>* cur, bool cumulative,
00407 float64_t* weights);
00408
00414 void POIMs_extract_W(float64_t* const* const W, const int32_t K);
00415
00420 void POIMs_precalc_SLR(const float64_t* const distrib);
00421
00431 void POIMs_get_SLR(
00432 const int32_t parentIdx, const int32_t sym, const int32_t depth,
00433 float64_t* S, float64_t* L, float64_t* R);
00434
00441 void POIMs_add_SLR(
00442 float64_t* const* const poims, const int32_t K,
00443 const int32_t debug);
00444
00449 inline bool get_use_compact_terminal_nodes()
00450 {
00451 return use_compact_terminal_nodes ;
00452 }
00453
00459 inline void set_use_compact_terminal_nodes(
00460 bool p_use_compact_terminal_nodes)
00461 {
00462 use_compact_terminal_nodes=p_use_compact_terminal_nodes ;
00463 }
00464
00469 inline int32_t get_num_used_nodes()
00470 {
00471 return TreeMemPtr;
00472 }
00473
00478 inline void set_position_weights(const float64_t * p_position_weights)
00479 {
00480 position_weights=p_position_weights;
00481 }
00482
00487 inline int32_t get_node(bool last_node=false)
00488 {
00489 int32_t ret = TreeMemPtr++;
00490 check_treemem() ;
00491
00492 if (last_node)
00493 {
00494 for (int32_t q=0; q<4; q++)
00495 TreeMem[ret].child_weights[q]=0.0;
00496 }
00497 else
00498 {
00499 for (int32_t q=0; q<4; q++)
00500 TreeMem[ret].children[q]=NO_CHILD;
00501 }
00502 #ifdef TRIE_CHECK_EVERYTHING
00503 TreeMem[ret].has_seq=false ;
00504 TreeMem[ret].has_floats=false ;
00505 #endif
00506 TreeMem[ret].weight=0.0;
00507 return ret ;
00508 }
00509
00511 inline void check_treemem()
00512 {
00513 if (TreeMemPtr+10 < TreeMemPtrMax)
00514 return;
00515 SG_DEBUG( "Extending TreeMem from %i to %i elements\n",
00516 TreeMemPtrMax, (int32_t) ((float64_t)TreeMemPtrMax*1.2));
00517 TreeMemPtrMax = (int32_t) ((float64_t)TreeMemPtrMax*1.2);
00518 TreeMem = (Trie*) realloc(TreeMem,
00519 TreeMemPtrMax*sizeof(Trie));
00520 if (!TreeMem)
00521 SG_ERROR( "out of memory\n");
00522 }
00523
00528 inline void set_weights_in_tree(bool weights_in_tree_)
00529 {
00530 weights_in_tree = weights_in_tree_;
00531 }
00532
00537 inline bool get_weights_in_tree()
00538 {
00539 return weights_in_tree;
00540 }
00541
00551 void POIMs_extract_W_helper(
00552 const int32_t nodeIdx, const int32_t depth, const int32_t offset,
00553 const int32_t y0, float64_t* const* const W, const int32_t K);
00554
00567 void POIMs_calc_SLR_helper1(
00568 const float64_t* const distrib, const int32_t i,
00569 const int32_t nodeIdx, int32_t left_tries_idx[4],
00570 const int32_t depth, int32_t const lastSym, float64_t* S,
00571 float64_t* L, float64_t* R);
00572
00573
00584 void POIMs_calc_SLR_helper2(
00585 const float64_t* const distrib, const int32_t i,
00586 const int32_t nodeIdx, int32_t left_tries_idx[4],
00587 const int32_t depth, float64_t* S, float64_t* L, float64_t* R);
00588
00599 void POIMs_add_SLR_helper1(
00600 const int32_t nodeIdx, const int32_t depth,const int32_t i,
00601 const int32_t y0, float64_t* const* const poims, const int32_t K,
00602 const int32_t debug);
00603
00617 void POIMs_add_SLR_helper2(
00618 float64_t* const* const poims, const int32_t K, const int32_t k,
00619 const int32_t i, const int32_t y, const float64_t valW,
00620 const float64_t valS, const float64_t valL, const float64_t valR,
00621 const int32_t debug);
00622
00624 inline virtual const char* get_name() const { return "Trie"; }
00625
00626 public:
00628 int32_t NUM_SYMS;
00629
00630 protected:
00632 int32_t length;
00634 int32_t * trees;
00635
00637 int32_t degree;
00639 float64_t const * position_weights;
00640
00642 Trie* TreeMem;
00644 int32_t TreeMemPtr;
00646 int32_t TreeMemPtrMax;
00648 bool use_compact_terminal_nodes;
00649
00651 bool weights_in_tree;
00652
00654 int32_t* nofsKmers;
00655 };
00656
00657 template <class Trie>
00658 CTrie<Trie>::CTrie(int32_t d, bool p_use_compact_terminal_nodes)
00659 : CSGObject(), degree(d), position_weights(NULL),
00660 use_compact_terminal_nodes(p_use_compact_terminal_nodes),
00661 weights_in_tree(true)
00662 {
00663 TreeMemPtrMax=1024*1024/sizeof(Trie);
00664 TreeMemPtr=0;
00665 TreeMem=(Trie*)malloc(TreeMemPtrMax*sizeof(Trie));
00666
00667 length=0;
00668 trees=NULL;
00669
00670 NUM_SYMS=4;
00671 }
00672
00673 template <class Trie>
00674 CTrie<Trie>::CTrie(const CTrie & to_copy)
00675 : CSGObject(to_copy), degree(to_copy.degree), position_weights(NULL),
00676 use_compact_terminal_nodes(to_copy.use_compact_terminal_nodes)
00677 {
00678 if (to_copy.position_weights!=NULL)
00679 {
00680 position_weights = to_copy.position_weights;
00681
00682
00683
00684 }
00685 else
00686 position_weights=NULL;
00687
00688 TreeMemPtrMax=to_copy.TreeMemPtrMax;
00689 TreeMemPtr=to_copy.TreeMemPtr;
00690 TreeMem=(Trie*)malloc(TreeMemPtrMax*sizeof(Trie));
00691 memcpy(TreeMem, to_copy.TreeMem, TreeMemPtrMax*sizeof(Trie));
00692
00693 length=to_copy.length;
00694 trees=new int32_t[length];
00695 for (int32_t i=0; i<length; i++)
00696 trees[i]=to_copy.trees[i];
00697
00698 NUM_SYMS=4;
00699 }
00700
00701 template <class Trie>
00702 const CTrie<Trie> &CTrie<Trie>::operator=(const CTrie<Trie> & to_copy)
00703 {
00704 degree=to_copy.degree ;
00705 use_compact_terminal_nodes=to_copy.use_compact_terminal_nodes ;
00706
00707 delete[] position_weights ;
00708 position_weights=NULL ;
00709 if (to_copy.position_weights!=NULL)
00710 {
00711 position_weights=to_copy.position_weights ;
00712
00713
00714
00715 }
00716 else
00717 position_weights=NULL ;
00718
00719 TreeMemPtrMax=to_copy.TreeMemPtrMax ;
00720 TreeMemPtr=to_copy.TreeMemPtr ;
00721 free(TreeMem) ;
00722 TreeMem = (Trie*)malloc(TreeMemPtrMax*sizeof(Trie)) ;
00723 memcpy(TreeMem, to_copy.TreeMem, TreeMemPtrMax*sizeof(Trie)) ;
00724
00725 length = to_copy.length ;
00726 if (trees)
00727 delete[] trees ;
00728 trees=new int32_t[length] ;
00729 for (int32_t i=0; i<length; i++)
00730 trees[i]=to_copy.trees[i] ;
00731
00732 return *this ;
00733 }
00734
00735 template <class Trie>
00736 int32_t CTrie<Trie>::find_deepest_node(
00737 int32_t start_node, int32_t& deepest_node) const
00738 {
00739 #ifdef TRIE_CHECK_EVERYTHING
00740 int32_t ret=0 ;
00741 SG_DEBUG("start_node=%i\n", start_node) ;
00742
00743 if (start_node==NO_CHILD)
00744 {
00745 for (int32_t i=0; i<length; i++)
00746 {
00747 int32_t my_deepest_node ;
00748 int32_t depth=find_deepest_node(i, my_deepest_node) ;
00749 SG_DEBUG("start_node %i depth=%i\n", i, depth) ;
00750 if (depth>ret)
00751 {
00752 deepest_node=my_deepest_node ;
00753 ret=depth ;
00754 }
00755 }
00756 return ret ;
00757 }
00758
00759 if (TreeMem[start_node].has_seq)
00760 {
00761 for (int32_t q=0; q<16; q++)
00762 if (TreeMem[start_node].seq[q]!=TRIE_TERMINAL_CHARACTER)
00763 ret++ ;
00764 deepest_node=start_node ;
00765 return ret ;
00766 }
00767 if (TreeMem[start_node].has_floats)
00768 {
00769 deepest_node=start_node ;
00770 return 1 ;
00771 }
00772
00773 for (int32_t q=0; q<4; q++)
00774 {
00775 int32_t my_deepest_node ;
00776 if (TreeMem[start_node].children[q]==NO_CHILD)
00777 continue ;
00778 int32_t depth=find_deepest_node(abs(TreeMem[start_node].children[q]), my_deepest_node) ;
00779 if (depth>ret)
00780 {
00781 deepest_node=my_deepest_node ;
00782 ret=depth ;
00783 }
00784 }
00785 return ret ;
00786 #else
00787 SG_ERROR( "not implemented\n") ;
00788 return 0 ;
00789 #endif
00790 }
00791
00792 template <class Trie>
00793 int32_t CTrie<Trie>::compact_nodes(
00794 int32_t start_node, int32_t depth, float64_t * weights)
00795 {
00796 SG_ERROR( "code buggy\n") ;
00797
00798 int32_t ret=0 ;
00799
00800 if (start_node==NO_CHILD)
00801 {
00802 for (int32_t i=0; i<length; i++)
00803 compact_nodes(i,1, weights) ;
00804 return 0 ;
00805 }
00806 if (start_node<0)
00807 return -1 ;
00808
00809 if (depth==degree-1)
00810 {
00811 TRIE_ASSERT_EVERYTHING(TreeMem[start_node].has_floats) ;
00812 int32_t num_used=0 ;
00813 for (int32_t q=0; q<4; q++)
00814 if (TreeMem[start_node].child_weights[q]!=0.0)
00815 num_used++ ;
00816 if (num_used>1)
00817 return -1 ;
00818 return 1 ;
00819 }
00820 TRIE_ASSERT_EVERYTHING(!TreeMem[start_node].has_floats) ;
00821
00822 int32_t num_used = 0 ;
00823 int32_t q_used=-1 ;
00824
00825 for (int32_t q=0; q<4; q++)
00826 {
00827 if (TreeMem[start_node].children[q]==NO_CHILD)
00828 continue ;
00829 num_used++ ;
00830 q_used=q ;
00831 }
00832 if (num_used>1)
00833 {
00834 if (depth>=degree-2)
00835 return -1 ;
00836 for (int32_t q=0; q<4; q++)
00837 {
00838 if (TreeMem[start_node].children[q]==NO_CHILD)
00839 continue ;
00840 int32_t num=compact_nodes(abs(TreeMem[start_node].children[q]), depth+1, weights) ;
00841 if (num<=2)
00842 continue ;
00843 int32_t node=get_node() ;
00844
00845 int32_t last_node=TreeMem[start_node].children[q] ;
00846 if (weights_in_tree)
00847 {
00848 ASSERT(weights[depth]!=0.0) ;
00849 TreeMem[node].weight=TreeMem[last_node].weight/weights[depth] ;
00850 }
00851 else
00852 TreeMem[node].weight=TreeMem[last_node].weight ;
00853
00854 #ifdef TRIE_CHECK_EVERYTHING
00855 TreeMem[node].has_seq=true ;
00856 #endif
00857 memset(TreeMem[node].seq, TRIE_TERMINAL_CHARACTER, 16) ;
00858 for (int32_t n=0; n<num; n++)
00859 {
00860 ASSERT(depth+n+1<=degree-1) ;
00861 ASSERT(last_node!=NO_CHILD) ;
00862 if (depth+n+1==degree-1)
00863 {
00864 TRIE_ASSERT_EVERYTHING(TreeMem[last_node].has_floats) ;
00865 int32_t k ;
00866 for (k=0; k<4; k++)
00867 if (TreeMem[last_node].child_weights[k]!=0.0)
00868 break ;
00869 if (k==4)
00870 break ;
00871 TreeMem[node].seq[n]=k ;
00872 break ;
00873 }
00874 else
00875 {
00876 TRIE_ASSERT_EVERYTHING(!TreeMem[last_node].has_floats) ;
00877 int32_t k ;
00878 for (k=0; k<4; k++)
00879 if (TreeMem[last_node].children[k]!=NO_CHILD)
00880 break ;
00881 if (k==4)
00882 break ;
00883 TreeMem[node].seq[n]=k ;
00884 last_node=TreeMem[last_node].children[k] ;
00885 }
00886 }
00887 TreeMem[start_node].children[q]=-node ;
00888 }
00889 return -1 ;
00890 }
00891 if (num_used==0)
00892 return 0 ;
00893
00894 ret=compact_nodes(abs(TreeMem[start_node].children[q_used]), depth+1, weights) ;
00895 if (ret<0)
00896 return ret ;
00897 return ret+1 ;
00898 }
00899
00900
00901 template <class Trie>
00902 bool CTrie<Trie>::compare_traverse(
00903 int32_t node, const CTrie<Trie> & other, int32_t other_node)
00904 {
00905 SG_DEBUG("checking nodes %i and %i\n", node, other_node) ;
00906 if (fabs(TreeMem[node].weight-other.TreeMem[other_node].weight)>=1e-5)
00907 {
00908 SG_DEBUG( "CTrie::compare: TreeMem[%i].weight=%f!=other.TreeMem[%i].weight=%f\n", node, TreeMem[node].weight, other_node,other.TreeMem[other_node].weight) ;
00909 SG_DEBUG( ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n") ;
00910 display_node(node) ;
00911 SG_DEBUG( "============================================================\n") ;
00912 other.display_node(other_node) ;
00913 SG_DEBUG( "<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n") ;
00914 return false ;
00915 }
00916
00917 #ifdef TRIE_CHECK_EVERYTHING
00918 if (TreeMem[node].has_seq!=other.TreeMem[other_node].has_seq)
00919 {
00920 SG_DEBUG( "CTrie::compare: TreeMem[%i].has_seq=%i!=other.TreeMem[%i].has_seq=%i\n", node, TreeMem[node].has_seq, other_node,other.TreeMem[other_node].has_seq) ;
00921 SG_DEBUG( ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n") ;
00922 display_node(node) ;
00923 SG_DEBUG( "============================================================\n") ;
00924 other.display_node(other_node) ;
00925 SG_DEBUG( "<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n") ;
00926 return false ;
00927 }
00928 if (TreeMem[node].has_floats!=other.TreeMem[other_node].has_floats)
00929 {
00930 SG_DEBUG( "CTrie::compare: TreeMem[%i].has_floats=%i!=other.TreeMem[%i].has_floats=%i\n", node, TreeMem[node].has_floats, other_node, other.TreeMem[other_node].has_floats) ;
00931 return false ;
00932 }
00933 if (other.TreeMem[other_node].has_floats)
00934 {
00935 for (int32_t q=0; q<4; q++)
00936 if (fabs(TreeMem[node].child_weights[q]-other.TreeMem[other_node].child_weights[q])>1e-5)
00937 {
00938 SG_DEBUG( "CTrie::compare: TreeMem[%i].child_weights[%i]=%e!=other.TreeMem[%i].child_weights[%i]=%e\n", node, q,TreeMem[node].child_weights[q], other_node,q,other.TreeMem[other_node].child_weights[q]) ;
00939 SG_DEBUG( ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n") ;
00940 display_node(node) ;
00941 SG_DEBUG( "============================================================\n") ;
00942 other.display_node(other_node) ;
00943 SG_DEBUG( "<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n") ;
00944 return false ;
00945 }
00946 }
00947 if (other.TreeMem[other_node].has_seq)
00948 {
00949 for (int32_t q=0; q<16; q++)
00950 if ((TreeMem[node].seq[q]!=other.TreeMem[other_node].seq[q]) && ((TreeMem[node].seq[q]<4)||(other.TreeMem[other_node].seq[q]<4)))
00951 {
00952 SG_DEBUG( "CTrie::compare: TreeMem[%i].seq[%i]=%i!=other.TreeMem[%i].seq[%i]=%i\n", node,q,TreeMem[node].seq[q], other_node,q,other.TreeMem[other_node].seq[q]) ;
00953 SG_DEBUG( ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n") ;
00954 display_node(node) ;
00955 SG_DEBUG( "============================================================\n") ;
00956 other.display_node(other_node) ;
00957 SG_DEBUG( "<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n") ;
00958 return false ;
00959 }
00960 }
00961 if (!other.TreeMem[other_node].has_seq && !other.TreeMem[other_node].has_floats)
00962 {
00963 for (int32_t q=0; q<4; q++)
00964 {
00965 if ((TreeMem[node].children[q]==NO_CHILD) && (other.TreeMem[other_node].children[q]==NO_CHILD))
00966 continue ;
00967 if ((TreeMem[node].children[q]==NO_CHILD)!=(other.TreeMem[other_node].children[q]==NO_CHILD))
00968 {
00969 SG_DEBUG( "CTrie::compare: TreeMem[%i].children[%i]=%i!=other.TreeMem[%i].children[%i]=%i\n", node,q,TreeMem[node].children[q], other_node,q,other.TreeMem[other_node].children[q]) ;
00970 SG_DEBUG( ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n") ;
00971 display_node(node) ;
00972 SG_DEBUG( "============================================================\n") ;
00973 other.display_node(other_node) ;
00974 SG_DEBUG( "<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n") ;
00975 return false ;
00976 }
00977 if (!compare_traverse(abs(TreeMem[node].children[q]), other, abs(other.TreeMem[other_node].children[q])))
00978 return false ;
00979 }
00980 }
00981 #else
00982 SG_ERROR( "not implemented\n") ;
00983 #endif
00984
00985 return true ;
00986 }
00987
00988 template <class Trie>
00989 bool CTrie<Trie>::compare(const CTrie<Trie> & other)
00990 {
00991 bool ret=true ;
00992 for (int32_t i=0; i<length; i++)
00993 if (!compare_traverse(trees[i], other, other.trees[i]))
00994 return false ;
00995 else
00996 SG_DEBUG("two tries at %i identical\n", i) ;
00997
00998 return ret ;
00999 }
01000
01001 template <class Trie>
01002 bool CTrie<Trie>::find_node(
01003 int32_t node, int32_t * trace, int32_t& trace_len) const
01004 {
01005 #ifdef TRIE_CHECK_EVERYTHING
01006 ASSERT(trace_len-1>=0) ;
01007 ASSERT((trace[trace_len-1]>=0) && (trace[trace_len-1]<TreeMemPtrMax))
01008 if (TreeMem[trace[trace_len-1]].has_seq)
01009 return false ;
01010 if (TreeMem[trace[trace_len-1]].has_floats)
01011 return false ;
01012
01013 for (int32_t q=0; q<4; q++)
01014 {
01015 if (TreeMem[trace[trace_len-1]].children[q]==NO_CHILD)
01016 continue ;
01017 int32_t tl=trace_len+1 ;
01018 if (TreeMem[trace[trace_len-1]].children[q]>=0)
01019 trace[trace_len]=TreeMem[trace[trace_len-1]].children[q] ;
01020 else
01021 trace[trace_len]=-TreeMem[trace[trace_len-1]].children[q] ;
01022
01023 if (trace[trace_len]==node)
01024 {
01025 trace_len=tl ;
01026 return true ;
01027 }
01028 if (find_node(node, trace, tl))
01029 {
01030 trace_len=tl ;
01031 return true ;
01032 }
01033 }
01034 trace_len=0 ;
01035 return false ;
01036 #else
01037 SG_ERROR( "not implemented\n") ;
01038 return false ;
01039 #endif
01040 }
01041
01042 template <class Trie>
01043 void CTrie<Trie>::display_node(int32_t node) const
01044 {
01045 #ifdef TRIE_CHECK_EVERYTHING
01046 int32_t * trace=new int32_t[2*degree] ;
01047 int32_t trace_len=-1 ;
01048 bool found = false ;
01049 int32_t tree=-1 ;
01050 for (tree=0; tree<length; tree++)
01051 {
01052 trace[0]=trees[tree] ;
01053 trace_len=1 ;
01054 found=find_node(node, trace, trace_len) ;
01055 if (found)
01056 break ;
01057 }
01058 ASSERT(found) ;
01059 SG_PRINT( "position %i trace: ", tree) ;
01060
01061 for (int32_t i=0; i<trace_len-1; i++)
01062 {
01063 int32_t branch=-1 ;
01064 for (int32_t q=0; q<4; q++)
01065 if (abs(TreeMem[trace[i]].children[q])==trace[i+1])
01066 {
01067 branch=q;
01068 break ;
01069 }
01070 ASSERT(branch!=-1) ;
01071 char acgt[5]="ACGT" ;
01072 SG_PRINT( "%c", acgt[branch]) ;
01073 }
01074 SG_PRINT( "\nnode=%i\nweight=%f\nhas_seq=%i\nhas_floats=%i\n", node, TreeMem[node].weight, TreeMem[node].has_seq, TreeMem[node].has_floats) ;
01075 if (TreeMem[node].has_floats)
01076 {
01077 for (int32_t q=0; q<4; q++)
01078 SG_PRINT( "child_weighs[%i] = %f\n", q, TreeMem[node].child_weights[q]) ;
01079 }
01080 if (TreeMem[node].has_seq)
01081 {
01082 for (int32_t q=0; q<16; q++)
01083 SG_PRINT( "seq[%i] = %i\n", q, TreeMem[node].seq[q]) ;
01084 }
01085 if (!TreeMem[node].has_seq && !TreeMem[node].has_floats)
01086 {
01087 for (int32_t q=0; q<4; q++)
01088 {
01089 if (TreeMem[node].children[q]!=NO_CHILD)
01090 {
01091 SG_PRINT( "children[%i] = %i -> \n", q, TreeMem[node].children[q]) ;
01092 display_node(abs(TreeMem[node].children[q])) ;
01093 }
01094 else
01095 SG_PRINT( "children[%i] = NO_CHILD -| \n", q, TreeMem[node].children[q]) ;
01096 }
01097
01098 }
01099
01100 delete[] trace ;
01101 #else
01102 SG_ERROR( "not implemented\n") ;
01103 #endif
01104 }
01105
01106
01107 template <class Trie> CTrie<Trie>::~CTrie()
01108 {
01109 destroy() ;
01110
01111 free(TreeMem) ;
01112 }
01113
01114 template <class Trie> void CTrie<Trie>::destroy()
01115 {
01116 if (trees!=NULL)
01117 {
01118 delete_trees();
01119 for (int32_t i=0; i<length; i++)
01120 trees[i] = NO_CHILD;
01121 delete[] trees;
01122
01123 TreeMemPtr=0;
01124 length=0;
01125 trees=NULL;
01126 }
01127 }
01128
01129 template <class Trie> void CTrie<Trie>::set_degree(int32_t d)
01130 {
01131 delete_trees(get_use_compact_terminal_nodes());
01132 degree=d;
01133 }
01134
01135 template <class Trie> void CTrie<Trie>::create(
01136 int32_t len, bool p_use_compact_terminal_nodes)
01137 {
01138 destroy();
01139
01140 trees=new int32_t[len] ;
01141 TreeMemPtr=0 ;
01142 for (int32_t i=0; i<len; i++)
01143 trees[i]=get_node(degree==1);
01144 length = len ;
01145
01146 use_compact_terminal_nodes=p_use_compact_terminal_nodes ;
01147 }
01148
01149
01150 template <class Trie> void CTrie<Trie>::delete_trees(
01151 bool p_use_compact_terminal_nodes)
01152 {
01153 if (trees==NULL)
01154 return;
01155
01156 TreeMemPtr=0 ;
01157 for (int32_t i=0; i<length; i++)
01158 trees[i]=get_node(degree==1);
01159
01160 use_compact_terminal_nodes=p_use_compact_terminal_nodes ;
01161 }
01162
01163 template <class Trie>
01164 float64_t CTrie<Trie>::compute_abs_weights_tree(int32_t tree, int32_t depth)
01165 {
01166 float64_t ret=0 ;
01167
01168 if (tree==NO_CHILD)
01169 return 0 ;
01170 TRIE_ASSERT(tree>=0) ;
01171
01172 if (depth==degree-2)
01173 {
01174 ret+=(TreeMem[tree].weight) ;
01175
01176 for (int32_t k=0; k<4; k++)
01177 ret+=(TreeMem[tree].child_weights[k]) ;
01178
01179 return ret ;
01180 }
01181
01182 ret+=(TreeMem[tree].weight) ;
01183
01184 for (int32_t i=0; i<4; i++)
01185 if (TreeMem[tree].children[i]!=NO_CHILD)
01186 ret += compute_abs_weights_tree(TreeMem[tree].children[i], depth+1) ;
01187
01188 return ret ;
01189 }
01190
01191
01192 template <class Trie>
01193 float64_t *CTrie<Trie>::compute_abs_weights(int32_t &len)
01194 {
01195 float64_t * sum=new float64_t[length*4] ;
01196 for (int32_t i=0; i<length*4; i++)
01197 sum[i]=0 ;
01198 len=length ;
01199
01200 for (int32_t i=0; i<length; i++)
01201 {
01202 TRIE_ASSERT(trees[i]!=NO_CHILD) ;
01203 for (int32_t k=0; k<4; k++)
01204 {
01205 sum[i*4+k]=compute_abs_weights_tree(TreeMem[trees[i]].children[k], 0) ;
01206 }
01207 }
01208
01209 return sum ;
01210 }
01211
01212 template <class Trie>
01213 void CTrie<Trie>::add_example_to_tree_mismatch_recursion(
01214 int32_t tree, int32_t i, float64_t alpha,
01215 int32_t *vec, int32_t len_rem,
01216 int32_t degree_rec, int32_t mismatch_rec,
01217 int32_t max_mismatch, float64_t * weights)
01218 {
01219 if (tree==NO_CHILD)
01220 tree=trees[i] ;
01221 TRIE_ASSERT(tree!=NO_CHILD) ;
01222
01223 if ((len_rem<=0) || (mismatch_rec>max_mismatch) || (degree_rec>degree))
01224 return ;
01225 const int32_t other[4][3] = { {1,2,3},{0,2,3},{0,1,3},{0,1,2} } ;
01226
01227 int32_t subtree = NO_CHILD ;
01228
01229 if (degree_rec==degree-1)
01230 {
01231 TRIE_ASSERT_EVERYTHING(TreeMem[tree].has_floats) ;
01232 if (weights_in_tree)
01233 TreeMem[tree].child_weights[vec[0]] += alpha*weights[degree_rec+degree*mismatch_rec];
01234 else
01235 if (weights[degree_rec]!=0.0)
01236 TreeMem[tree].child_weights[vec[0]] += alpha*weights[degree_rec+degree*mismatch_rec]/weights[degree_rec];
01237 if (mismatch_rec+1<=max_mismatch)
01238 for (int32_t o=0; o<3; o++)
01239 {
01240 if (weights_in_tree)
01241 TreeMem[tree].child_weights[other[vec[0]][o]] += alpha*weights[degree_rec+degree*(mismatch_rec+1)];
01242 else
01243 if (weights[degree_rec]!=0.0)
01244 TreeMem[tree].child_weights[other[vec[0]][o]] += alpha*weights[degree_rec+degree*(mismatch_rec+1)]/weights[degree_rec];
01245 }
01246 return ;
01247 }
01248 else
01249 {
01250 TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_floats) ;
01251 if (TreeMem[tree].children[vec[0]]!=NO_CHILD)
01252 {
01253 subtree=TreeMem[tree].children[vec[0]] ;
01254 if (weights_in_tree)
01255 TreeMem[subtree].weight += alpha*weights[degree_rec+degree*mismatch_rec];
01256 else
01257 if (weights[degree_rec]!=0.0)
01258 TreeMem[subtree].weight += alpha*weights[degree_rec+degree*mismatch_rec]/weights[degree_rec];
01259 }
01260 else
01261 {
01262 int32_t tmp = get_node(degree_rec==degree-2);
01263 ASSERT(tmp>=0) ;
01264 TreeMem[tree].children[vec[0]]=tmp ;
01265 subtree=tmp ;
01266 #ifdef TRIE_CHECK_EVERYTHING
01267 if (degree_rec==degree-2)
01268 TreeMem[subtree].has_floats=true ;
01269 #endif
01270 if (weights_in_tree)
01271 TreeMem[subtree].weight = alpha*weights[degree_rec+degree*mismatch_rec] ;
01272 else
01273 if (weights[degree_rec]!=0.0)
01274 TreeMem[subtree].weight = alpha*weights[degree_rec+degree*mismatch_rec]/weights[degree_rec] ;
01275 else
01276 TreeMem[subtree].weight = 0.0 ;
01277 }
01278 add_example_to_tree_mismatch_recursion(subtree, i, alpha,
01279 &vec[1], len_rem-1,
01280 degree_rec+1, mismatch_rec, max_mismatch, weights) ;
01281
01282 if (mismatch_rec+1<=max_mismatch)
01283 {
01284 TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_floats) ;
01285 for (int32_t o=0; o<3; o++)
01286 {
01287 int32_t ot = other[vec[0]][o] ;
01288 if (TreeMem[tree].children[ot]!=NO_CHILD)
01289 {
01290 subtree=TreeMem[tree].children[ot] ;
01291 if (weights_in_tree)
01292 TreeMem[subtree].weight += alpha*weights[degree_rec+degree*(mismatch_rec+1)];
01293 else
01294 if (weights[degree_rec]!=0.0)
01295 TreeMem[subtree].weight += alpha*weights[degree_rec+degree*(mismatch_rec+1)]/weights[degree_rec];
01296 }
01297 else
01298 {
01299 int32_t tmp = get_node(degree_rec==degree-2);
01300 ASSERT(tmp>=0) ;
01301 TreeMem[tree].children[ot]=tmp ;
01302 subtree=tmp ;
01303 #ifdef TRIE_CHECK_EVERYTHING
01304 if (degree_rec==degree-2)
01305 TreeMem[subtree].has_floats=true ;
01306 #endif
01307
01308 if (weights_in_tree)
01309 TreeMem[subtree].weight = alpha*weights[degree_rec+degree*(mismatch_rec+1)] ;
01310 else
01311 if (weights[degree_rec]!=0.0)
01312 TreeMem[subtree].weight = alpha*weights[degree_rec+degree*(mismatch_rec+1)]/weights[degree_rec] ;
01313 else
01314 TreeMem[subtree].weight = 0.0 ;
01315 }
01316
01317 add_example_to_tree_mismatch_recursion(subtree, i, alpha,
01318 &vec[1], len_rem-1,
01319 degree_rec+1, mismatch_rec+1, max_mismatch, weights) ;
01320 }
01321 }
01322 }
01323 }
01324
01325 template <class Trie>
01326 void CTrie<Trie>::compute_scoring_helper(
01327 int32_t tree, int32_t i, int32_t j, float64_t weight, int32_t d,
01328 int32_t max_degree, int32_t num_feat, int32_t num_sym, int32_t sym_offset,
01329 int32_t offs, float64_t* result)
01330 {
01331 if (i+j<num_feat)
01332 {
01333 float64_t decay=1.0;
01334
01335
01336
01337 if (j<degree-1)
01338 {
01339 for (int32_t k=0; k<num_sym; k++)
01340 {
01341 if (TreeMem[tree].children[k]!=NO_CHILD)
01342 {
01343 int32_t child=TreeMem[tree].children[k];
01344
01345 if (d<max_degree-1)
01346 compute_scoring_helper(child, i, j+1, weight+decay*TreeMem[child].weight, d+1, max_degree, num_feat, num_sym, sym_offset, num_sym*offs+k, result);
01347 else
01348 result[sym_offset*(i+j-max_degree+1)+num_sym*offs+k] += weight+decay*TreeMem[child].weight;
01349
01351 if (d==0)
01352 compute_scoring_helper(child, i, j+1, 0.0, 0, max_degree, num_feat, num_sym, sym_offset, offs, result);
01353 }
01354 }
01355 }
01356 else if (j==degree-1)
01357 {
01358 for (int32_t k=0; k<num_sym; k++)
01359 {
01360
01361 if (d<max_degree-1 && i<num_feat-1)
01362 compute_scoring_helper(trees[i+1], i+1, 0, weight+decay*TreeMem[tree].child_weights[k], d+1, max_degree, num_feat, num_sym, sym_offset, num_sym*offs+k, result);
01363 else
01364 result[sym_offset*(i+j-max_degree+1)+num_sym*offs+k] += weight+decay*TreeMem[tree].child_weights[k];
01365 }
01366 }
01367 }
01368 }
01369
01370 template <class Trie>
01371 void CTrie<Trie>::traverse(
01372 int32_t tree, const int32_t p, struct TreeParseInfo info,
01373 const int32_t depth, int32_t* const x, const int32_t k)
01374 {
01375 const int32_t num_sym = info.num_sym;
01376 const int32_t y0 = info.y0;
01377 const int32_t y1 = (k==0) ? 0 : y0 - ( (depth<k) ? 0 : info.nofsKmers[k-1] * x[depth-k] );
01378
01379
01380
01381
01382
01383 int32_t sym;
01384 ASSERT( depth < degree );
01385
01386 if (depth<degree-1)
01387 {
01388 for( sym=0; sym<num_sym; ++sym ) {
01389 const int32_t childNum = TreeMem[tree].children[ sym ];
01390 if( childNum != NO_CHILD ) {
01391 int32_t child = childNum ;
01392 x[depth] = sym;
01393 info.substrs[depth+1] = y0 + sym;
01394 info.y0 = (k==0) ? 0 : (y1+sym)*num_sym;
01395
01396 count( TreeMem[child].weight, depth, info, p, x, k );
01397 traverse( child, p, info, depth+1, x, k );
01398 x[depth] = -1;
01399 }
01400 }
01401 }
01402 else if( depth == degree-1 )
01403 {
01404 for( sym=0; sym<num_sym; ++sym ) {
01405 const float64_t w = TreeMem[tree].child_weights[ sym ];
01406 if( w != 0.0 ) {
01407 x[depth] = sym;
01408 info.substrs[depth+1] = y0 + sym;
01409 info.y0 = (k==0) ? 0 : (y1+sym)*num_sym;
01410
01411 count( w, depth, info, p, x, k );
01412 x[depth] = -1;
01413 }
01414 }
01415 }
01416
01417
01418 }
01419
01420 template <class Trie>
01421 void CTrie<Trie>::count(
01422 const float64_t w, const int32_t depth, const struct TreeParseInfo info,
01423 const int32_t p, int32_t* x, const int32_t k)
01424 {
01425 ASSERT( fabs(w) < 1e10 );
01426 ASSERT( x[depth] >= 0 );
01427 ASSERT( x[depth+1] < 0 );
01428 if ( depth < k ) {
01429 return;
01430 }
01431
01432 const int32_t nofKmers = info.nofsKmers[k];
01433 const float64_t margWeight = w * info.margFactors[ depth-k ];
01434 const int32_t m_a = depth - k + 1;
01435 const int32_t m_b = info.num_feat - p;
01436 const int32_t m = ( m_a < m_b ) ? m_a : m_b;
01437
01438 const int32_t offset0 = nofKmers * p;
01439 register int32_t i;
01440 register int32_t offset;
01441 offset = offset0;
01442 for( i = 0; i < m; ++i ) {
01443 const int32_t y = info.substrs[i+k+1];
01444 info.C_k[ y + offset ] += margWeight;
01445 offset += nofKmers;
01446 }
01447 if( depth > k ) {
01448
01449 const int32_t offsR = info.substrs[k+1] + offset0;
01450 info.R_k[offsR] += margWeight;
01451
01452 if( p+depth-k < info.num_feat ) {
01453 const int32_t offsL = info.substrs[depth+1] + nofKmers * (p+depth-k);
01454 info.L_k[offsL] += margWeight;
01455 }
01456 }
01457
01458
01459
01460
01461
01462
01463
01464
01465
01466
01467
01468
01469
01470
01471
01472 }
01473
01474 template <class Trie>
01475 void CTrie<Trie>::add_to_trie(
01476 int32_t i, int32_t seq_offset, int32_t * vec, float32_t alpha,
01477 float64_t *weights, bool degree_times_position_weights)
01478 {
01479 int32_t tree = trees[i] ;
01480
01481
01482 int32_t max_depth = 0 ;
01483 float64_t* weights_column ;
01484 if (degree_times_position_weights)
01485 weights_column = &weights[(i+seq_offset)*degree] ;
01486 else
01487 weights_column = weights ;
01488
01489 if (weights_in_tree)
01490 {
01491 for (int32_t j=0; (j<degree) && (i+j<length); j++)
01492 if (CMath::abs(weights_column[j]*alpha)>0)
01493 max_depth = j+1 ;
01494 }
01495 else
01496
01497 max_depth=degree ;
01498
01499 for (int32_t j=0; (j<max_depth) && (i+j+seq_offset<length); j++)
01500 {
01501 TRIE_ASSERT((vec[i+j+seq_offset]>=0) && (vec[i+j+seq_offset]<4)) ;
01502 if ((j<degree-1) && (TreeMem[tree].children[vec[i+j+seq_offset]]!=NO_CHILD))
01503 {
01504 if (TreeMem[tree].children[vec[i+j+seq_offset]]<0)
01505 {
01506
01507 TRIE_ASSERT(j >= degree-16) ;
01508
01509 TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq) ;
01510 TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_floats) ;
01511 int32_t node = - TreeMem[tree].children[vec[i+j+seq_offset]] ;
01512
01513 TRIE_ASSERT((node>=0) && (node<=TreeMemPtrMax)) ;
01514 TRIE_ASSERT_EVERYTHING(TreeMem[node].has_seq) ;
01515 TRIE_ASSERT_EVERYTHING(!TreeMem[node].has_floats) ;
01516
01517
01518 int32_t mismatch_pos = -1 ;
01519 {
01520 int32_t k ;
01521 for (k=0; (j+k<max_depth) && (i+j+seq_offset+k<length); k++)
01522 {
01523 TRIE_ASSERT((vec[i+j+seq_offset+k]>=0) && (vec[i+j+seq_offset+k]<4)) ;
01524
01525 if ((TreeMem[node].seq[k]>=4) && (TreeMem[node].seq[k]!=TRIE_TERMINAL_CHARACTER))
01526 fprintf(stderr, "+++i=%i j=%i seq[%i]=%i\n", i, j, k, TreeMem[node].seq[k]) ;
01527 TRIE_ASSERT((TreeMem[node].seq[k]<4) || (TreeMem[node].seq[k]==TRIE_TERMINAL_CHARACTER)) ;
01528 TRIE_ASSERT(k<16) ;
01529 if (TreeMem[node].seq[k]!=vec[i+j+seq_offset+k])
01530 {
01531 mismatch_pos=k ;
01532 break ;
01533 }
01534 }
01535 }
01536
01537
01538
01539 if (mismatch_pos==-1)
01540
01541 TreeMem[node].weight+=alpha ;
01542 else
01543
01544
01545
01546
01547 {
01548
01549 int32_t last_node=tree ;
01550
01551
01552 int32_t k ;
01553 for (k=0; k<mismatch_pos; k++)
01554 {
01555 TRIE_ASSERT((vec[i+j+seq_offset+k]>=0) && (vec[i+j+seq_offset+k]<4)) ;
01556 TRIE_ASSERT(vec[i+j+seq_offset+k]==TreeMem[node].seq[k]) ;
01557
01558 int32_t tmp=get_node();
01559 TreeMem[last_node].children[vec[i+j+seq_offset+k]]=tmp ;
01560 last_node=tmp ;
01561 if (weights_in_tree)
01562 TreeMem[last_node].weight = (TreeMem[node].weight+alpha)*weights_column[j+k] ;
01563 else
01564 TreeMem[last_node].weight = (TreeMem[node].weight+alpha) ;
01565 TRIE_ASSERT(j+k!=degree-1) ;
01566 }
01567 if ((TreeMem[node].seq[mismatch_pos]>=4) && (TreeMem[node].seq[mismatch_pos]!=TRIE_TERMINAL_CHARACTER))
01568 fprintf(stderr, "**i=%i j=%i seq[%i]=%i\n", i, j, k, TreeMem[node].seq[mismatch_pos]) ;
01569 ASSERT((TreeMem[node].seq[mismatch_pos]<4) || (TreeMem[node].seq[mismatch_pos]==TRIE_TERMINAL_CHARACTER)) ;
01570 TRIE_ASSERT(vec[i+j+seq_offset+mismatch_pos]!=TreeMem[node].seq[mismatch_pos]) ;
01571
01572 if (j+k==degree-1)
01573 {
01574
01575
01576
01577 for (int32_t q=0; q<4; q++)
01578 TreeMem[last_node].child_weights[q]=0.0 ;
01579 if (weights_in_tree)
01580 {
01581 if (TreeMem[node].seq[mismatch_pos]<4)
01582 TreeMem[last_node].child_weights[TreeMem[node].seq[mismatch_pos]]+=TreeMem[node].weight*weights_column[degree-1] ;
01583 TreeMem[last_node].child_weights[vec[i+j+seq_offset+k]] += alpha*weights_column[degree-1] ;
01584 }
01585 else
01586 {
01587 if (TreeMem[node].seq[mismatch_pos]<4)
01588 TreeMem[last_node].child_weights[TreeMem[node].seq[mismatch_pos]]=TreeMem[node].weight ;
01589 TreeMem[last_node].child_weights[vec[i+j+seq_offset+k]] = alpha ;
01590 }
01591
01592 #ifdef TRIE_CHECK_EVERYTHING
01593 TreeMem[last_node].has_floats=true ;
01594 #endif
01595 }
01596 else
01597 {
01598
01599 if (TreeMem[node].seq[mismatch_pos]<4)
01600 {
01601 TreeMem[last_node].children[TreeMem[node].seq[mismatch_pos]] = -node ;
01602
01603
01604 for (int32_t q=0; q<16; q++)
01605 {
01606 if ((j+q+mismatch_pos<degree) && (i+j+seq_offset+q+mismatch_pos<length))
01607 TreeMem[node].seq[q] = TreeMem[node].seq[q+mismatch_pos] ;
01608 else
01609 TreeMem[node].seq[q] = TRIE_TERMINAL_CHARACTER ;
01610 }
01611 #ifdef TRIE_CHECK_EVERYTHING
01612 TreeMem[node].has_seq=true ;
01613 #endif
01614 }
01615
01616
01617 TRIE_ASSERT((vec[i+j+seq_offset+mismatch_pos]>=0) && (vec[i+j+seq_offset+mismatch_pos]<4)) ;
01618 int32_t tmp = get_node() ;
01619 TreeMem[last_node].children[vec[i+j+seq_offset+mismatch_pos]] = -tmp ;
01620 last_node=tmp ;
01621
01622 TreeMem[last_node].weight = alpha ;
01623 #ifdef TRIE_CHECK_EVERYTHING
01624 TreeMem[last_node].has_seq = true ;
01625 #endif
01626 memset(TreeMem[last_node].seq, TRIE_TERMINAL_CHARACTER, 16) ;
01627 for (int32_t q=0; (j+q+mismatch_pos<degree) && (i+j+seq_offset+q+mismatch_pos<length); q++)
01628 TreeMem[last_node].seq[q] = vec[i+j+seq_offset+mismatch_pos+q] ;
01629 }
01630 }
01631 break ;
01632 }
01633 else
01634 {
01635 tree=TreeMem[tree].children[vec[i+j+seq_offset]] ;
01636 TRIE_ASSERT((tree>=0) && (tree<TreeMemPtrMax)) ;
01637 TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq) ;
01638 if (weights_in_tree)
01639 TreeMem[tree].weight += alpha*weights_column[j];
01640 else
01641 TreeMem[tree].weight += alpha ;
01642 }
01643 }
01644 else if (j==degree-1)
01645 {
01646
01647 TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq) ;
01648 TRIE_ASSERT_EVERYTHING(TreeMem[tree].has_floats) ;
01649 if (weights_in_tree)
01650 TreeMem[tree].child_weights[vec[i+j+seq_offset]] += alpha*weights_column[j] ;
01651 else
01652 TreeMem[tree].child_weights[vec[i+j+seq_offset]] += alpha;
01653
01654 break;
01655 }
01656 else
01657 {
01658 bool use_seq = use_compact_terminal_nodes && (j>degree-16) ;
01659 TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq) ;
01660 TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_floats) ;
01661
01662 int32_t tmp = get_node((j==degree-2) && (!use_seq));
01663 if (use_seq)
01664 TreeMem[tree].children[vec[i+j+seq_offset]] = -tmp ;
01665 else
01666 TreeMem[tree].children[vec[i+j+seq_offset]] = tmp ;
01667 tree=tmp ;
01668
01669 TRIE_ASSERT((tree>=0) && (tree<TreeMemPtrMax)) ;
01670 #ifdef TRIE_CHECK_EVERYTHING
01671 TreeMem[tree].has_seq = use_seq ;
01672 #endif
01673 if (use_seq)
01674 {
01675 TreeMem[tree].weight = alpha ;
01676
01677 memset(TreeMem[tree].seq, TRIE_TERMINAL_CHARACTER, 16) ;
01678 for (int32_t q=0; (j+q<degree) && (i+j+seq_offset+q<length); q++)
01679 {
01680 TRIE_ASSERT(q<16) ;
01681 TreeMem[tree].seq[q]=vec[i+j+seq_offset+q] ;
01682 }
01683 break ;
01684 }
01685 else
01686 {
01687 if (weights_in_tree)
01688 TreeMem[tree].weight = alpha*weights_column[j] ;
01689 else
01690 TreeMem[tree].weight = alpha ;
01691 #ifdef TRIE_CHECK_EVERYTHING
01692 if (j==degree-2)
01693 TreeMem[tree].has_floats = true ;
01694 #endif
01695 }
01696 }
01697 }
01698 }
01699
01700 template <class Trie>
01701 float64_t CTrie<Trie>::compute_by_tree_helper(
01702 int32_t* vec, int32_t len, int32_t seq_pos, int32_t tree_pos,
01703 int32_t weight_pos, float64_t* weights,
01704 bool degree_times_position_weights)
01705 {
01706 int32_t tree = trees[tree_pos] ;
01707
01708 if ((position_weights!=NULL) && (position_weights[weight_pos]==0))
01709 return 0.0;
01710
01711 float64_t *weights_column=NULL ;
01712 if (degree_times_position_weights)
01713 weights_column=&weights[weight_pos*degree] ;
01714 else
01715 weights_column=weights ;
01716
01717 float64_t sum=0 ;
01718 for (int32_t j=0; seq_pos+j < len; j++)
01719 {
01720 TRIE_ASSERT((vec[seq_pos+j]<4) && (vec[seq_pos+j]>=0)) ;
01721
01722 if ((j<degree-1) && (TreeMem[tree].children[vec[seq_pos+j]]!=NO_CHILD))
01723 {
01724 TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_floats) ;
01725 if (TreeMem[tree].children[vec[seq_pos+j]]<0)
01726 {
01727 tree = - TreeMem[tree].children[vec[seq_pos+j]];
01728 TRIE_ASSERT(tree>=0) ;
01729 TRIE_ASSERT_EVERYTHING(TreeMem[tree].has_seq) ;
01730 float64_t this_weight=0.0 ;
01731 for (int32_t k=0; (j+k<degree) && (seq_pos+j+k<length); k++)
01732 {
01733 TRIE_ASSERT((vec[seq_pos+j+k]<4) && (vec[seq_pos+j+k]>=0)) ;
01734 if (TreeMem[tree].seq[k]!=vec[seq_pos+j+k])
01735 break ;
01736 this_weight += weights_column[j+k] ;
01737 }
01738 sum += TreeMem[tree].weight * this_weight ;
01739 break ;
01740 }
01741 else
01742 {
01743 tree=TreeMem[tree].children[vec[seq_pos+j]];
01744 TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq) ;
01745 if (weights_in_tree)
01746 sum += TreeMem[tree].weight ;
01747 else
01748 sum += TreeMem[tree].weight * weights_column[j] ;
01749 } ;
01750 }
01751 else
01752 {
01753 TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq) ;
01754 if (j==degree-1)
01755 {
01756 TRIE_ASSERT_EVERYTHING(TreeMem[tree].has_floats) ;
01757 if (weights_in_tree)
01758 sum += TreeMem[tree].child_weights[vec[seq_pos+j]] ;
01759 else
01760 sum += TreeMem[tree].child_weights[vec[seq_pos+j]] * weights_column[j] ;
01761 }
01762 else
01763 TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_floats) ;
01764
01765 break;
01766 }
01767 }
01768
01769 if (position_weights!=NULL)
01770 return sum*position_weights[weight_pos] ;
01771 else
01772 return sum ;
01773 }
01774
01775 template <class Trie>
01776 void CTrie<Trie>::compute_by_tree_helper(
01777 int32_t* vec, int32_t len, int32_t seq_pos, int32_t tree_pos,
01778 int32_t weight_pos, float64_t* LevelContrib, float64_t factor,
01779 int32_t mkl_stepsize, float64_t * weights,
01780 bool degree_times_position_weights)
01781 {
01782 int32_t tree = trees[tree_pos] ;
01783 if (factor==0)
01784 return ;
01785
01786 if (position_weights!=NULL)
01787 {
01788 factor *= position_weights[weight_pos] ;
01789 if (factor==0)
01790 return ;
01791 if (!degree_times_position_weights)
01792 {
01793 for (int32_t j=0; seq_pos+j<len; j++)
01794 {
01795 if ((j<degree-1) && (TreeMem[tree].children[vec[seq_pos+j]]!=NO_CHILD))
01796 {
01797 if (TreeMem[tree].children[vec[seq_pos+j]]<0)
01798 {
01799 tree = -TreeMem[tree].children[vec[seq_pos+j]];
01800 TRIE_ASSERT_EVERYTHING(TreeMem[tree].has_seq) ;
01801 for (int32_t k=0; (j+k<degree) && (seq_pos+j+k<length); k++)
01802 {
01803 if (TreeMem[tree].seq[k]!=vec[seq_pos+j+k])
01804 break ;
01805 if (weights_in_tree)
01806 LevelContrib[weight_pos/mkl_stepsize] += factor*TreeMem[tree].weight ;
01807 else
01808 LevelContrib[weight_pos/mkl_stepsize] += factor*TreeMem[tree].weight*weights[j+k] ;
01809 }
01810 break ;
01811 }
01812 else
01813 {
01814 tree=TreeMem[tree].children[vec[seq_pos+j]];
01815 TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq) ;
01816 if (weights_in_tree)
01817 LevelContrib[weight_pos/mkl_stepsize] += factor*TreeMem[tree].weight ;
01818 else
01819 LevelContrib[weight_pos/mkl_stepsize] += factor*TreeMem[tree].weight*weights[j] ;
01820 }
01821 }
01822 else
01823 {
01824 TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq) ;
01825 if (j==degree-1)
01826 {
01827 if (weights_in_tree)
01828 LevelContrib[weight_pos/mkl_stepsize] += factor*TreeMem[tree].child_weights[vec[seq_pos+j]] ;
01829 else
01830 LevelContrib[weight_pos/mkl_stepsize] += factor*TreeMem[tree].child_weights[vec[seq_pos+j]]*weights[j] ;
01831 }
01832 }
01833 }
01834 }
01835 else
01836 {
01837 for (int32_t j=0; seq_pos+j<len; j++)
01838 {
01839 if ((j<degree-1) && (TreeMem[tree].children[vec[seq_pos+j]]!=NO_CHILD))
01840 {
01841 if (TreeMem[tree].children[vec[seq_pos+j]]<0)
01842 {
01843 tree = -TreeMem[tree].children[vec[seq_pos+j]];
01844 TRIE_ASSERT_EVERYTHING(TreeMem[tree].has_seq) ;
01845 for (int32_t k=0; (j+k<degree) && (seq_pos+j+k<length); k++)
01846 {
01847 if (TreeMem[tree].seq[k]!=vec[seq_pos+j+k])
01848 break ;
01849 if (weights_in_tree)
01850 LevelContrib[weight_pos/mkl_stepsize] += factor*TreeMem[tree].weight ;
01851 else
01852 LevelContrib[weight_pos/mkl_stepsize] += factor*TreeMem[tree].weight*weights[j+k+weight_pos*degree] ;
01853 }
01854 break ;
01855 }
01856 else
01857 {
01858 tree=TreeMem[tree].children[vec[seq_pos+j]];
01859 TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq) ;
01860 if (weights_in_tree)
01861 LevelContrib[weight_pos/mkl_stepsize] += factor*TreeMem[tree].weight ;
01862 else
01863 LevelContrib[weight_pos/mkl_stepsize] += factor*TreeMem[tree].weight*weights[j+weight_pos*degree] ;
01864 }
01865 }
01866 else
01867 {
01868 TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq) ;
01869 if (j==degree-1)
01870 {
01871 if (weights_in_tree)
01872 LevelContrib[weight_pos/mkl_stepsize] += factor*TreeMem[tree].child_weights[vec[seq_pos+j]] ;
01873 else
01874 LevelContrib[weight_pos/mkl_stepsize] += factor*TreeMem[tree].child_weights[vec[seq_pos+j]]*weights[j+weight_pos*degree] ;
01875 }
01876 break ;
01877 }
01878 }
01879 }
01880 }
01881 else if (!degree_times_position_weights)
01882 {
01883 for (int32_t j=0; seq_pos+j<len; j++)
01884 {
01885 if ((j<degree-1) && (TreeMem[tree].children[vec[seq_pos+j]]!=NO_CHILD))
01886 {
01887 if (TreeMem[tree].children[vec[seq_pos+j]]<0)
01888 {
01889 tree = -TreeMem[tree].children[vec[seq_pos+j]];
01890 TRIE_ASSERT_EVERYTHING(TreeMem[tree].has_seq) ;
01891 for (int32_t k=0; (j+k<degree) && (seq_pos+j+k<length); k++)
01892 {
01893 if (TreeMem[tree].seq[k]!=vec[seq_pos+j+k])
01894 break ;
01895 if (weights_in_tree)
01896 LevelContrib[(j+k)/mkl_stepsize] += factor*TreeMem[tree].weight ;
01897 else
01898 LevelContrib[(j+k)/mkl_stepsize] += factor*TreeMem[tree].weight*weights[j+k] ;
01899 }
01900 break ;
01901 }
01902 else
01903 {
01904 tree=TreeMem[tree].children[vec[seq_pos+j]];
01905 TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq) ;
01906 if (weights_in_tree)
01907 LevelContrib[j/mkl_stepsize] += factor*TreeMem[tree].weight ;
01908 else
01909 LevelContrib[j/mkl_stepsize] += factor*TreeMem[tree].weight*weights[j] ;
01910 }
01911 }
01912 else
01913 {
01914 TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq) ;
01915 if (j==degree-1)
01916 {
01917 if (weights_in_tree)
01918 LevelContrib[j/mkl_stepsize] += factor*TreeMem[tree].child_weights[vec[seq_pos+j]] ;
01919 else
01920 LevelContrib[j/mkl_stepsize] += factor*TreeMem[tree].child_weights[vec[seq_pos+j]]*weights[j] ;
01921 }
01922 break ;
01923 }
01924 }
01925 }
01926 else
01927 {
01928
01929
01930
01931
01932
01933
01934
01935
01936
01937
01938
01939
01940
01941
01942
01943
01944
01945
01946 for (int32_t j=0; seq_pos+j<len; j++)
01947 {
01948 if ((j<degree-1) && (TreeMem[tree].children[vec[seq_pos+j]]!=NO_CHILD))
01949 {
01950 if (TreeMem[tree].children[vec[seq_pos+j]]<0)
01951 {
01952 tree = -TreeMem[tree].children[vec[seq_pos+j]];
01953 TRIE_ASSERT_EVERYTHING(TreeMem[tree].has_seq) ;
01954 for (int32_t k=0; (j+k<degree) && (seq_pos+j+k<length); k++)
01955 {
01956 if (TreeMem[tree].seq[k]!=vec[seq_pos+j+k])
01957 break ;
01958 if (weights_in_tree)
01959 LevelContrib[(j+k+degree*weight_pos)/mkl_stepsize] += factor*TreeMem[tree].weight ;
01960 else
01961 LevelContrib[(j+k+degree*weight_pos)/mkl_stepsize] += factor*TreeMem[tree].weight*weights[j+k+weight_pos*degree] ;
01962 }
01963 break ;
01964 }
01965 else
01966 {
01967 tree=TreeMem[tree].children[vec[seq_pos+j]];
01968 TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq) ;
01969 if (weights_in_tree)
01970 LevelContrib[(j+degree*weight_pos)/mkl_stepsize] += factor * TreeMem[tree].weight ;
01971 else
01972 LevelContrib[(j+degree*weight_pos)/mkl_stepsize] += factor * TreeMem[tree].weight * weights[j+weight_pos*degree] ;
01973 }
01974 }
01975 else
01976 {
01977 TRIE_ASSERT_EVERYTHING(!TreeMem[tree].has_seq) ;
01978 if (j==degree-1)
01979 {
01980 if (weights_in_tree)
01981 LevelContrib[(j+degree*weight_pos)/mkl_stepsize] += factor * TreeMem[tree].child_weights[vec[seq_pos+j]] ;
01982 else
01983 LevelContrib[(j+degree*weight_pos)/mkl_stepsize] += factor * TreeMem[tree].child_weights[vec[seq_pos+j]] * weights[j+weight_pos*degree] ;
01984 }
01985 break ;
01986 }
01987 }
01988 }
01989 }
01990
01991 template <class Trie>
01992 void CTrie<Trie>::fill_backtracking_table_recursion(
01993 Trie* tree, int32_t depth, uint64_t seq, float64_t value,
01994 CDynamicArray<ConsensusEntry>* table, float64_t* weights)
01995 {
01996 float64_t w=1.0;
01997
01998 if (weights_in_tree || depth==0)
01999 value+=tree->weight;
02000 else
02001 {
02002 w=weights[degree-1];
02003 value+=weights[depth-1]*tree->weight;
02004 }
02005
02006 if (degree-1==depth)
02007 {
02008 for (int32_t sym=0; sym<4; sym++)
02009 {
02010 float64_t v=w*tree->child_weights[sym];
02011 if (v!=0.0)
02012 {
02013 ConsensusEntry entry;
02014 entry.bt=-1;
02015 entry.score=value+v;
02016 entry.string=seq | ((uint64_t) sym) << (2*(degree-depth-1));
02017
02018 table->append_element(entry);
02019 }
02020 }
02021 }
02022 else
02023 {
02024 for (int32_t sym=0; sym<4; sym++)
02025 {
02026 uint64_t str=seq | ((uint64_t) sym) << (2*(degree-depth-1));
02027 if (tree->children[sym] != NO_CHILD)
02028 fill_backtracking_table_recursion(&TreeMem[tree->children[sym]], depth+1, str, value, table, weights);
02029 }
02030 }
02031 }
02032
02033 template <class Trie>
02034 float64_t CTrie<Trie>::get_cumulative_score(
02035 int32_t pos, uint64_t seq, int32_t deg, float64_t* weights)
02036 {
02037 float64_t result=0.0;
02038
02039
02040
02041 for (int32_t i=pos; i<pos+deg && i<length; i++)
02042 {
02043
02044 Trie* tree = &TreeMem[trees[i]];
02045
02046 for (int32_t d=0; d<deg-i+pos; d++)
02047 {
02048
02049 ASSERT(d-1<degree);
02050 int32_t sym = (int32_t) (seq >> (2*(deg-1-d-i+pos)) & 3);
02051
02052 float64_t w=1.0;
02053 if (!weights_in_tree)
02054 w=weights[d];
02055
02056 ASSERT(tree->children[sym] != NO_CHILD);
02057 tree=&TreeMem[tree->children[sym]];
02058 result+=w*tree->weight;
02059 }
02060 }
02061
02062 return result;
02063 }
02064
02065 template <class Trie>
02066 void CTrie<Trie>::fill_backtracking_table(
02067 int32_t pos, CDynamicArray<ConsensusEntry>* prev,
02068 CDynamicArray<ConsensusEntry>* cur, bool cumulative, float64_t* weights)
02069 {
02070 ASSERT(pos>=0 && pos<length);
02071 ASSERT(!use_compact_terminal_nodes);
02072
02073 Trie* t = &TreeMem[trees[pos]];
02074
02075 fill_backtracking_table_recursion(t, 0, (uint64_t) 0, 0.0, cur, weights);
02076
02077
02078 if (cumulative)
02079 {
02080 int32_t num_cur=cur->get_num_elements();
02081 for (int32_t i=0; i<num_cur; i++)
02082 {
02083 ConsensusEntry entry=cur->get_element(i);
02084 entry.score+=get_cumulative_score(pos+1, entry.string, degree-1, weights);
02085 cur->set_element(entry,i);
02086
02087 }
02088 }
02089
02090
02091
02092 if (prev)
02093 {
02094 int32_t num_cur=cur->get_num_elements();
02095 int32_t num_prev=prev->get_num_elements();
02096
02097 for (int32_t i=0; i<num_cur; i++)
02098 {
02099
02100 uint64_t str_cur= cur->get_element(i).string >> 2;
02101
02102
02103 int32_t bt=-1;
02104 float64_t max_score=0.0;
02105
02106 for (int32_t j=0; j<num_prev; j++)
02107 {
02108
02109 uint64_t mask=
02110 ((((uint64_t)0)-1) ^ (((uint64_t) 3) << (2*(degree-1))));
02111 uint64_t str_prev= mask & prev->get_element(j).string;
02112
02113
02114 if (str_cur == str_prev)
02115 {
02116 float64_t sc=prev->get_element(j).score+cur->get_element(i).score;
02117 if (bt==-1 || sc>max_score)
02118 {
02119 bt=j;
02120 max_score=sc;
02121
02122
02123 }
02124 }
02125 }
02126
02127 ASSERT(bt!=-1);
02128 ConsensusEntry entry;
02129 entry.bt=bt;
02130 entry.score=max_score;
02131 entry.string=cur->get_element(i).string;
02132 cur->set_element(entry, i);
02133
02134 }
02135 }
02136 }
02137 #endif // _TRIE_H___