LinearHMM.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2009 Soeren Sonnenburg
00008  * Written (W) 1999-2008 Gunnar Raetsch
00009  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
00010  */
00011 
00012 #include "distributions/hmm/LinearHMM.h"
00013 #include "lib/common.h"
00014 #include "features/StringFeatures.h"
00015 #include "lib/io.h"
00016 
00017 CLinearHMM::CLinearHMM(CStringFeatures<uint16_t>* f)
00018 : CDistribution(), transition_probs(NULL), log_transition_probs(NULL)
00019 {
00020     features=f;
00021     sequence_length = f->get_vector_length(0);
00022     num_symbols     = (int32_t) f->get_num_symbols();
00023     num_params      = sequence_length*num_symbols;
00024 }
00025 
00026 CLinearHMM::CLinearHMM(int32_t p_num_features, int32_t p_num_symbols)
00027 : CDistribution(), transition_probs(NULL), log_transition_probs(NULL)
00028 {
00029     sequence_length = p_num_features;
00030     num_symbols     = p_num_symbols;
00031     num_params      = sequence_length*num_symbols;
00032 }
00033 
00034 CLinearHMM::~CLinearHMM()
00035 {
00036     delete[] transition_probs;
00037     delete[] log_transition_probs;
00038 }
00039 
00040 bool CLinearHMM::train()
00041 {
00042     delete[] transition_probs;
00043     delete[] log_transition_probs;
00044     int32_t* int_transition_probs=new int32_t[num_params];
00045 
00046     int32_t vec;
00047     int32_t i;
00048 
00049     for (i=0; i< num_params; i++)
00050         int_transition_probs[i]=0;
00051 
00052     for (vec=0; vec<features->get_num_vectors(); vec++)
00053     {
00054         int32_t len;
00055 
00056         uint16_t* vector=((CStringFeatures<uint16_t>*) features)->
00057             get_feature_vector(vec, len);
00058 
00059         //just count the symbols per position -> transition_probsogram
00060         for (int32_t feat=0; feat<len ; feat++)
00061             int_transition_probs[feat*num_symbols+vector[feat]]++;
00062     }
00063 
00064     //trade memory for speed
00065     transition_probs=new float64_t[num_params];
00066     log_transition_probs=new float64_t[num_params];
00067 
00068     for (i=0;i<sequence_length;i++)
00069     {
00070         for (int32_t j=0; j<num_symbols; j++)
00071         {
00072             float64_t sum=0;
00073             int32_t offs=i*num_symbols+
00074                 ((CStringFeatures<uint16_t> *) features)->
00075                     get_masked_symbols((uint16_t)j,(uint8_t) 254);
00076             int32_t original_num_symbols=(int32_t)
00077                 ((CStringFeatures<uint16_t> *) features)->
00078                     get_original_num_symbols();
00079 
00080             for (int32_t k=0; k<original_num_symbols; k++)
00081                 sum+=int_transition_probs[offs+k];
00082 
00083             transition_probs[i*num_symbols+j]=
00084                 (int_transition_probs[i*num_symbols+j]+pseudo_count)/
00085                 (sum+((CStringFeatures<uint16_t> *) features)->
00086                     get_original_num_symbols()*pseudo_count);
00087             log_transition_probs[i*num_symbols+j]=
00088                 log(transition_probs[i*num_symbols+j]);
00089         }
00090     }
00091 
00092     delete[] int_transition_probs;
00093     return true;
00094 }
00095 
00096 bool CLinearHMM::train(
00097     const int32_t* indizes, int32_t num_indizes, float64_t pseudo)
00098 {
00099     delete[] transition_probs;
00100     delete[] log_transition_probs;
00101     int32_t* int_transition_probs=new int32_t[num_params];
00102     int32_t vec;
00103     int32_t i;
00104 
00105     for (i=0; i< num_params; i++)
00106         int_transition_probs[i]=0;
00107 
00108     for (vec=0; vec<num_indizes; vec++)
00109     {
00110         int32_t len;
00111 
00112         ASSERT(indizes[vec]>=0 &&
00113             indizes[vec]<((CStringFeatures<uint16_t>*) features)->
00114                 get_num_vectors());
00115         uint16_t* vector=((CStringFeatures<uint16_t>*) features)->
00116             get_feature_vector(indizes[vec], len);
00117 
00118         //just count the symbols per position -> transition_probsogram
00119         //
00120         for (int32_t feat=0; feat<len ; feat++)
00121             int_transition_probs[feat*num_symbols+vector[feat]]++;
00122     }
00123 
00124     //trade memory for speed
00125     transition_probs=new float64_t[num_params];
00126     log_transition_probs=new float64_t[num_params];
00127 
00128     for (i=0;i<sequence_length;i++)
00129     {
00130         for (int32_t j=0; j<num_symbols; j++)
00131         {
00132             float64_t sum=0;
00133             int32_t original_num_symbols=(int32_t)
00134                 ((CStringFeatures<uint16_t> *) features)->
00135                     get_original_num_symbols();
00136             for (int32_t k=0; k<original_num_symbols; k++)
00137             {
00138                 sum+=int_transition_probs[i*num_symbols+
00139                     ((CStringFeatures<uint16_t>*) features)->
00140                         get_masked_symbols((uint16_t)j,(uint8_t) 254)+k];
00141             }
00142 
00143             transition_probs[i*num_symbols+j]=
00144                 (int_transition_probs[i*num_symbols+j]+pseudo)/
00145                 (sum+((CStringFeatures<uint16_t>*) features)->
00146                     get_original_num_symbols()*pseudo);
00147             log_transition_probs[i*num_symbols+j]=
00148                 log(transition_probs[i*num_symbols+j]);
00149         }
00150     }
00151 
00152     delete[] int_transition_probs;
00153     return true;
00154 }
00155 
00156 float64_t CLinearHMM::get_log_likelihood_example(uint16_t* vector, int32_t len)
00157 {
00158     float64_t result=log_transition_probs[vector[0]];
00159 
00160     for (int32_t i=1; i<len; i++)
00161         result+=log_transition_probs[i*num_symbols+vector[i]];
00162     
00163     return result;
00164 }
00165 
00166 float64_t CLinearHMM::get_log_likelihood_example(int32_t num_example)
00167 {
00168     int32_t len;
00169     uint16_t* vector=((CStringFeatures<uint16_t>*) features)->
00170         get_feature_vector(num_example, len);
00171     float64_t result=log_transition_probs[vector[0]];
00172 
00173     for (int32_t i=1; i<len; i++)
00174         result+=log_transition_probs[i*num_symbols+vector[i]];
00175 
00176     return result;
00177 }
00178 
00179 float64_t CLinearHMM::get_likelihood_example(uint16_t* vector, int32_t len)
00180 {
00181     float64_t result=transition_probs[vector[0]];
00182 
00183     for (int32_t i=1; i<len; i++)
00184         result*=transition_probs[i*num_symbols+vector[i]];
00185     
00186     return result;
00187 }
00188 
00189 float64_t CLinearHMM::get_log_derivative(int32_t num_param, int32_t num_example)
00190 {
00191     int32_t len;
00192     uint16_t* vector=((CStringFeatures<uint16_t>*) features)->
00193         get_feature_vector(num_example, len);
00194     float64_t result=0;
00195     int32_t position=num_param/num_symbols;
00196     ASSERT(position>=0 && position<len);
00197     uint16_t sym=(uint16_t) (num_param-position*num_symbols);
00198 
00199     if (vector[position]==sym && transition_probs[num_param]!=0)
00200         result=1.0/transition_probs[num_param];
00201 
00202     return result;
00203 }
00204 
00205 void CLinearHMM::get_transition_probs(float64_t** dst, int32_t* num)
00206 {
00207     *num=num_params;
00208     size_t sz=sizeof(*transition_probs)*(*num);
00209     *dst=(float64_t*) malloc(sz);
00210     ASSERT(dst);
00211 
00212     memcpy(*dst, transition_probs, sz);
00213 }
00214 
00215 bool CLinearHMM::set_transition_probs(const float64_t* src, int32_t num)
00216 {
00217     if (num!=-1)
00218         ASSERT(num==num_params);
00219 
00220     if (!log_transition_probs)
00221         log_transition_probs=new float64_t[num_params];
00222 
00223     if (!transition_probs)
00224         transition_probs=new float64_t[num_params];
00225 
00226     for (int32_t i=0; i<num_params; i++)
00227     {
00228         transition_probs[i]=src[i];
00229         log_transition_probs[i]=log(transition_probs[i]);
00230     }
00231 
00232     return true;
00233 }
00234 
00235 void CLinearHMM::get_log_transition_probs(float64_t** dst, int32_t* num)
00236 {
00237     *num=num_params;
00238     size_t sz=sizeof(*log_transition_probs)*(*num);
00239     *dst=(float64_t*) malloc(sz);
00240     ASSERT(dst);
00241 
00242     memcpy(*dst, log_transition_probs, sz);
00243 }
00244 
00245 bool CLinearHMM::set_log_transition_probs(const float64_t* src, int32_t num)
00246 {
00247     if (num!=-1)
00248         ASSERT(num==num_params);
00249 
00250     if (!log_transition_probs)
00251         log_transition_probs=new float64_t[num_params];
00252 
00253     if (!transition_probs)
00254         transition_probs=new float64_t[num_params];
00255 
00256     for (int32_t i=0; i< num_params; i++)
00257     {
00258         log_transition_probs[i]=src[i];
00259         transition_probs[i]=exp(log_transition_probs[i]);
00260     }
00261 
00262     return true;
00263 }
00264 
00265 
00266 
00267 

SHOGUN Machine Learning Toolbox - Documentation