OligoStringKernel.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2008 Christian Igel, Tobias Glasmachers
00008  * Copyright (C) 2008 Christian Igel, Tobias Glasmachers
00009  *
00010  * Shogun adjustments (W) 2008-2009 Soeren Sonnenburg
00011  * Copyright (C) 2008-2009 Fraunhofer Institute FIRST and Max-Planck-Society
00012  *
00013  */
00014 #include "kernel/OligoStringKernel.h"
00015 #include "kernel/SqrtDiagKernelNormalizer.h"
00016 #include "features/StringFeatures.h"
00017 
00018 #include <map>
00019 #include <vector>
00020 #include <algorithm>
00021 
00022 COligoStringKernel::COligoStringKernel(int32_t cache_sz, int32_t kmer_len, float64_t w)
00023 : CStringKernel<char>(cache_sz), k(kmer_len), width(w), gauss_table(NULL)
00024 {
00025     set_normalizer(new CSqrtDiagKernelNormalizer());
00026 }
00027 
00028 COligoStringKernel::~COligoStringKernel()
00029 {
00030     cleanup();
00031 }
00032 
00033 void COligoStringKernel::cleanup()
00034 {
00035     delete[] gauss_table;
00036     gauss_table=NULL;
00037 
00038     CKernel::cleanup();
00039 }
00040 
00041 bool COligoStringKernel::init(CFeatures* l, CFeatures* r)
00042 {
00043     cleanup();
00044 
00045     CStringKernel<char>::init(l,r);
00046     int32_t max_len=CMath::max(
00047             ((CStringFeatures<char>*) l)->get_max_vector_length(),
00048             ((CStringFeatures<char>*) r)->get_max_vector_length()
00049             );
00050 
00051     getExpFunctionCache(max_len);
00052     return init_normalizer();
00053 }
00054 
00055 void COligoStringKernel::encodeOligo(
00056     const std::string& sequence, uint32_t k_mer_length,
00057     const std::string& allowed_characters,
00058     std::vector< std::pair<int32_t, float64_t> >& values)
00059 {
00060     float64_t oligo_value = 0.;
00061     float64_t factor      = 1.;
00062     std::map<std::string::value_type, uint32_t> residue_values;
00063     uint32_t counter = 0;
00064     uint32_t number_of_residues = allowed_characters.size();
00065     uint32_t sequence_length = sequence.size();
00066     bool sequence_ok = true;
00067 
00068     // checking if sequence contains illegal characters
00069     for (uint32_t i = 0; i < sequence.size(); ++i)
00070     {
00071         if (allowed_characters.find(sequence.at(i)) == std::string::npos)
00072             sequence_ok = false;
00073     }
00074 
00075     if (sequence_ok && k_mer_length <= sequence_length)
00076     {
00077         values.resize(sequence_length - k_mer_length + 1,
00078             std::pair<int32_t, float64_t>());
00079         for (uint32_t i = 0; i < number_of_residues; ++i)
00080         {   
00081             residue_values.insert(std::make_pair(allowed_characters[i], counter));
00082             ++counter;
00083         }
00084         for (int32_t k = k_mer_length - 1; k >= 0; k--)
00085         {
00086             oligo_value += factor * residue_values[sequence[k]];
00087             factor *= number_of_residues;
00088         }
00089         factor /= number_of_residues;
00090         counter = 0;
00091         values[counter].first = 1;
00092         values[counter].second = oligo_value;
00093         ++counter;
00094 
00095         for (uint32_t j = 1; j < sequence_length - k_mer_length + 1; j++)
00096         {
00097             oligo_value -= factor * residue_values[sequence[j - 1]];
00098             oligo_value = oligo_value * number_of_residues +
00099                 residue_values[sequence[j + k_mer_length - 1]];
00100 
00101             values[counter].first = j + 1;
00102             values[counter].second = oligo_value ;
00103             ++counter;
00104         }
00105         stable_sort(values.begin(), values.end(), cmpOligos_);
00106     }
00107     else
00108     {
00109         values.clear();
00110     }   
00111 }
00112 
00113 void COligoStringKernel::getSequences(
00114     const std::vector<std::string>& sequences, uint32_t k_mer_length,
00115     const std::string& allowed_characters,
00116     std::vector< std::vector< std::pair<int32_t, float64_t> > >& encoded_sequences)
00117 {
00118     std::vector< std::pair<int32_t, float64_t> > temp_vector;
00119     encoded_sequences.resize(sequences.size(),
00120         std::vector< std::pair<int32_t, float64_t> >());
00121 
00122     for (uint32_t i = 0; i < sequences.size(); ++i)
00123     {
00124         encodeOligo(sequences[i], k_mer_length, allowed_characters, temp_vector);
00125         encoded_sequences[i] = temp_vector;
00126     }
00127 }
00128 
00129 void COligoStringKernel::getExpFunctionCache(uint32_t sequence_length)
00130 {
00131     delete[] gauss_table;
00132     gauss_table=new float64_t[sequence_length];
00133 
00134     gauss_table[0] = 1;
00135     for (uint32_t i = 1; i < sequence_length - 1; i++)
00136         gauss_table[i] = exp((-1 / (CMath::sq(width))) * CMath::sq(i));
00137 }
00138 
00139 float64_t COligoStringKernel::kernelOligoFast(
00140     const std::vector< std::pair<int32_t, float64_t> >& x,
00141     const std::vector< std::pair<int32_t, float64_t> >& y,
00142     int32_t max_distance)
00143 {
00144     float64_t result = 0;
00145     int32_t  i1     = 0;
00146     int32_t  i2     = 0;
00147     int32_t  c1     = 0;
00148     uint32_t x_size = x.size();
00149     uint32_t y_size = y.size();
00150 
00151     while ((uint32_t) i1 < x_size && (uint32_t) i2 < y_size)
00152     {
00153         if (x[i1].second == y[i2].second)
00154         {
00155             if (max_distance < 0
00156                     || (abs(x[i1].first - y[i2].first)) <= max_distance)
00157             {
00158                 result += gauss_table[abs((x[i1].first - y[i2].first))];
00159                 if (x[i1].second == x[i1 + 1].second)
00160                 {
00161                     i1++;
00162                     c1++;
00163                 }
00164                 else if (y[i2].second == y[i2 + 1].second)
00165                 {
00166                     i2++;
00167                     i1 -= c1;
00168                     c1 = 0;
00169                 }
00170                 else
00171                 {
00172                     i1++;
00173                     i2++;
00174                 }
00175             }
00176             else
00177             {
00178                 if (x[i1].first < y[i2].first)
00179                 {
00180                     if (x[i1].second == x[i1 + 1].second)
00181                     {
00182                         i1++;
00183                     }
00184                     else if (y[i2].second == y[i2 + 1].second)
00185                     {
00186                         while(y[i2++].second == y[i2].second)
00187                         {
00188                             ;
00189                         }
00190                         ++i1;
00191                         c1 = 0;
00192                     }
00193                     else
00194                     {
00195                         i1++;
00196                         i2++;
00197                         c1 = 0;
00198                     }
00199                 }
00200                 else
00201                 {
00202                     i2++;
00203                     i1 -= c1;
00204                     c1 = 0;
00205                 }
00206             }
00207         }
00208         else
00209         {
00210             if (x[i1].second < y[i2].second)
00211                 i1++;
00212             else
00213                 i2++;
00214             c1 = 0;
00215         }
00216     }
00217     return result;
00218 }       
00219 
00220 
00221 float64_t COligoStringKernel::compute(int32_t idx_a, int32_t idx_b)
00222 {
00223     int32_t alen, blen;
00224     char* avec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx_a, alen);
00225     char* bvec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx_b, blen);
00226     std::vector< std::pair<int32_t, float64_t> > aenc;
00227     std::vector< std::pair<int32_t, float64_t> > benc;
00228     encodeOligo(std::string(avec, alen), k, "ACGT", aenc);
00229     encodeOligo(std::string(bvec, alen), k, "ACGT", benc);
00230     float64_t result=kernelOligoFast(aenc, benc);
00231     return result;
00232 }
00233 

SHOGUN Machine Learning Toolbox - Documentation