00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014 #include "kernel/OligoStringKernel.h"
00015 #include "kernel/SqrtDiagKernelNormalizer.h"
00016 #include "features/StringFeatures.h"
00017
00018 #include <map>
00019 #include <vector>
00020 #include <algorithm>
00021
00022 COligoStringKernel::COligoStringKernel(int32_t cache_sz, int32_t kmer_len, float64_t w)
00023 : CStringKernel<char>(cache_sz), k(kmer_len), width(w), gauss_table(NULL)
00024 {
00025 set_normalizer(new CSqrtDiagKernelNormalizer());
00026 }
00027
00028 COligoStringKernel::~COligoStringKernel()
00029 {
00030 cleanup();
00031 }
00032
00033 void COligoStringKernel::cleanup()
00034 {
00035 delete[] gauss_table;
00036 gauss_table=NULL;
00037
00038 CKernel::cleanup();
00039 }
00040
00041 bool COligoStringKernel::init(CFeatures* l, CFeatures* r)
00042 {
00043 cleanup();
00044
00045 CStringKernel<char>::init(l,r);
00046 int32_t max_len=CMath::max(
00047 ((CStringFeatures<char>*) l)->get_max_vector_length(),
00048 ((CStringFeatures<char>*) r)->get_max_vector_length()
00049 );
00050
00051 getExpFunctionCache(max_len);
00052 return init_normalizer();
00053 }
00054
00055 void COligoStringKernel::encodeOligo(
00056 const std::string& sequence, uint32_t k_mer_length,
00057 const std::string& allowed_characters,
00058 std::vector< std::pair<int32_t, float64_t> >& values)
00059 {
00060 float64_t oligo_value = 0.;
00061 float64_t factor = 1.;
00062 std::map<std::string::value_type, uint32_t> residue_values;
00063 uint32_t counter = 0;
00064 uint32_t number_of_residues = allowed_characters.size();
00065 uint32_t sequence_length = sequence.size();
00066 bool sequence_ok = true;
00067
00068
00069 for (uint32_t i = 0; i < sequence.size(); ++i)
00070 {
00071 if (allowed_characters.find(sequence.at(i)) == std::string::npos)
00072 sequence_ok = false;
00073 }
00074
00075 if (sequence_ok && k_mer_length <= sequence_length)
00076 {
00077 values.resize(sequence_length - k_mer_length + 1,
00078 std::pair<int32_t, float64_t>());
00079 for (uint32_t i = 0; i < number_of_residues; ++i)
00080 {
00081 residue_values.insert(std::make_pair(allowed_characters[i], counter));
00082 ++counter;
00083 }
00084 for (int32_t k = k_mer_length - 1; k >= 0; k--)
00085 {
00086 oligo_value += factor * residue_values[sequence[k]];
00087 factor *= number_of_residues;
00088 }
00089 factor /= number_of_residues;
00090 counter = 0;
00091 values[counter].first = 1;
00092 values[counter].second = oligo_value;
00093 ++counter;
00094
00095 for (uint32_t j = 1; j < sequence_length - k_mer_length + 1; j++)
00096 {
00097 oligo_value -= factor * residue_values[sequence[j - 1]];
00098 oligo_value = oligo_value * number_of_residues +
00099 residue_values[sequence[j + k_mer_length - 1]];
00100
00101 values[counter].first = j + 1;
00102 values[counter].second = oligo_value ;
00103 ++counter;
00104 }
00105 stable_sort(values.begin(), values.end(), cmpOligos_);
00106 }
00107 else
00108 {
00109 values.clear();
00110 }
00111 }
00112
00113 void COligoStringKernel::getSequences(
00114 const std::vector<std::string>& sequences, uint32_t k_mer_length,
00115 const std::string& allowed_characters,
00116 std::vector< std::vector< std::pair<int32_t, float64_t> > >& encoded_sequences)
00117 {
00118 std::vector< std::pair<int32_t, float64_t> > temp_vector;
00119 encoded_sequences.resize(sequences.size(),
00120 std::vector< std::pair<int32_t, float64_t> >());
00121
00122 for (uint32_t i = 0; i < sequences.size(); ++i)
00123 {
00124 encodeOligo(sequences[i], k_mer_length, allowed_characters, temp_vector);
00125 encoded_sequences[i] = temp_vector;
00126 }
00127 }
00128
00129 void COligoStringKernel::getExpFunctionCache(uint32_t sequence_length)
00130 {
00131 delete[] gauss_table;
00132 gauss_table=new float64_t[sequence_length];
00133
00134 gauss_table[0] = 1;
00135 for (uint32_t i = 1; i < sequence_length - 1; i++)
00136 gauss_table[i] = exp((-1 / (CMath::sq(width))) * CMath::sq(i));
00137 }
00138
00139 float64_t COligoStringKernel::kernelOligoFast(
00140 const std::vector< std::pair<int32_t, float64_t> >& x,
00141 const std::vector< std::pair<int32_t, float64_t> >& y,
00142 int32_t max_distance)
00143 {
00144 float64_t result = 0;
00145 int32_t i1 = 0;
00146 int32_t i2 = 0;
00147 int32_t c1 = 0;
00148 uint32_t x_size = x.size();
00149 uint32_t y_size = y.size();
00150
00151 while ((uint32_t) i1 < x_size && (uint32_t) i2 < y_size)
00152 {
00153 if (x[i1].second == y[i2].second)
00154 {
00155 if (max_distance < 0
00156 || (abs(x[i1].first - y[i2].first)) <= max_distance)
00157 {
00158 result += gauss_table[abs((x[i1].first - y[i2].first))];
00159 if (x[i1].second == x[i1 + 1].second)
00160 {
00161 i1++;
00162 c1++;
00163 }
00164 else if (y[i2].second == y[i2 + 1].second)
00165 {
00166 i2++;
00167 i1 -= c1;
00168 c1 = 0;
00169 }
00170 else
00171 {
00172 i1++;
00173 i2++;
00174 }
00175 }
00176 else
00177 {
00178 if (x[i1].first < y[i2].first)
00179 {
00180 if (x[i1].second == x[i1 + 1].second)
00181 {
00182 i1++;
00183 }
00184 else if (y[i2].second == y[i2 + 1].second)
00185 {
00186 while(y[i2++].second == y[i2].second)
00187 {
00188 ;
00189 }
00190 ++i1;
00191 c1 = 0;
00192 }
00193 else
00194 {
00195 i1++;
00196 i2++;
00197 c1 = 0;
00198 }
00199 }
00200 else
00201 {
00202 i2++;
00203 i1 -= c1;
00204 c1 = 0;
00205 }
00206 }
00207 }
00208 else
00209 {
00210 if (x[i1].second < y[i2].second)
00211 i1++;
00212 else
00213 i2++;
00214 c1 = 0;
00215 }
00216 }
00217 return result;
00218 }
00219
00220
00221 float64_t COligoStringKernel::compute(int32_t idx_a, int32_t idx_b)
00222 {
00223 int32_t alen, blen;
00224 char* avec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx_a, alen);
00225 char* bvec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx_b, blen);
00226 std::vector< std::pair<int32_t, float64_t> > aenc;
00227 std::vector< std::pair<int32_t, float64_t> > benc;
00228 encodeOligo(std::string(avec, alen), k, "ACGT", aenc);
00229 encodeOligo(std::string(bvec, alen), k, "ACGT", benc);
00230 float64_t result=kernelOligoFast(aenc, benc);
00231 return result;
00232 }
00233