00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 1999-2009 Soeren Sonnenburg 00008 * Written (W) 1999-2008 Gunnar Raetsch 00009 * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society 00010 */ 00011 00012 #include "distributions/histogram/Histogram.h" 00013 #include "lib/common.h" 00014 #include "features/StringFeatures.h" 00015 #include "lib/io.h" 00016 #include "lib/Mathematics.h" 00017 00018 00019 CHistogram::CHistogram() 00020 : CDistribution() 00021 { 00022 hist=new float64_t[1<<16]; 00023 } 00024 00025 CHistogram::CHistogram(CStringFeatures<uint16_t> *f) 00026 : CDistribution() 00027 { 00028 hist=new float64_t[1<<16]; 00029 features=f; 00030 } 00031 00032 CHistogram::~CHistogram() 00033 { 00034 delete[] hist; 00035 } 00036 00037 bool CHistogram::train() 00038 { 00039 int32_t vec; 00040 int32_t feat; 00041 int32_t i; 00042 00043 ASSERT(features); 00044 ASSERT(features->get_feature_class()==C_STRING); 00045 ASSERT(features->get_feature_type()==F_WORD); 00046 00047 for (i=0; i< (int32_t) (1<<16); i++) 00048 hist[i]=0; 00049 00050 for (vec=0; vec<features->get_num_vectors(); vec++) 00051 { 00052 int32_t len; 00053 00054 uint16_t* vector=((CStringFeatures<uint16_t>*) features)-> 00055 get_feature_vector(vec, len); 00056 00057 for (feat=0; feat<len ; feat++) 00058 hist[vector[feat]]++; 00059 } 00060 00061 for (i=0; i< (int32_t) (1<<16); i++) 00062 hist[i]=log(hist[i]); 00063 00064 return true; 00065 } 00066 00067 float64_t CHistogram::get_log_likelihood_example(int32_t num_example) 00068 { 00069 ASSERT(features); 00070 ASSERT(features->get_feature_class()==C_STRING); 00071 ASSERT(features->get_feature_type()==F_WORD); 00072 00073 int32_t len; 00074 float64_t loglik=0; 00075 00076 uint16_t* vector=((CStringFeatures<uint16_t>*) features)-> 00077 get_feature_vector(num_example, len); 00078 00079 for (int32_t i=0; i<len; i++) 00080 loglik+=hist[vector[i]]; 00081 00082 return loglik; 00083 } 00084 00085 float64_t CHistogram::get_log_derivative(int32_t num_param, int32_t num_example) 00086 { 00087 if (hist[num_param] < CMath::ALMOST_NEG_INFTY) 00088 return -CMath::INFTY; 00089 else 00090 { 00091 ASSERT(features); 00092 ASSERT(features->get_feature_class()==C_STRING); 00093 ASSERT(features->get_feature_type()==F_WORD); 00094 00095 int32_t len; 00096 float64_t deriv=0; 00097 00098 uint16_t* vector=((CStringFeatures<uint16_t>*) features)-> 00099 get_feature_vector(num_example, len); 00100 00101 int32_t num_occurences=0; 00102 00103 for (int32_t i=0; i<len; i++) 00104 { 00105 deriv+=hist[vector[i]]; 00106 00107 if (vector[i]==num_param) 00108 num_occurences++; 00109 } 00110 00111 if (num_occurences>0) 00112 deriv+=CMath::log((float64_t) num_occurences)-hist[num_param]; 00113 else 00114 deriv=-CMath::INFTY; 00115 00116 return deriv; 00117 } 00118 } 00119 00120 float64_t CHistogram::get_log_model_parameter(int32_t num_param) 00121 { 00122 return hist[num_param]; 00123 } 00124 00125 bool CHistogram::set_histogram(float64_t* src, int32_t num) 00126 { 00127 ASSERT(num==get_num_model_parameters()); 00128 00129 delete[] hist; 00130 hist=new float64_t[num]; 00131 for (int32_t i=0; i<num; i++) { 00132 hist[i]=src[i]; 00133 } 00134 00135 return true; 00136 } 00137 00138 void CHistogram::get_histogram(float64_t** dst, int32_t* num) 00139 { 00140 *num=get_num_model_parameters(); 00141 size_t sz=sizeof(*hist)*(*num); 00142 *dst=(float64_t*) malloc(sz); 00143 ASSERT(dst); 00144 00145 memcpy(*dst, hist, sz); 00146 } 00147