SpectrumRBFKernel.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2009 Soeren Sonnenburg
00008  * Written (W) 1999-2008 Gunnar Raetsch
00009  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
00010  */
00011 
00012 #include <vector>
00013 
00014 #include "lib/common.h"
00015 #include "lib/io.h"
00016 #include "lib/Signal.h"
00017 #include "lib/Trie.h"
00018 #include "base/Parallel.h"
00019 
00020 #include "kernel/SpectrumRBFKernel.h"
00021 #include "features/Features.h"
00022 #include "features/StringFeatures.h"
00023 #include <math.h>
00024 
00025 #include <vector>
00026 #include <string>
00027 #include <fstream>
00028 #include <cmath>
00029 
00030 #include <assert.h>
00031 
00032 #ifndef WIN32
00033 #include <pthread.h>
00034 #endif
00035 
00036 
00037 using namespace shogun;
00038 
00039 CSpectrumRBFKernel::CSpectrumRBFKernel (int32_t size, float64_t *AA_matrix_, int32_t degree_, float64_t width_)
00040   : CStringKernel<char>(size), alphabet(NULL), degree(degree_), width(width_), sequences(NULL), string_features(NULL), nof_sequences(0), max_sequence_length(0)
00041 {
00042     lhs=NULL;
00043     rhs=NULL;
00044 
00045     target_letter_0=-1 ;
00046 
00047     AA_matrix=new float64_t[128*128];
00048     
00049 
00050     memcpy(AA_matrix, AA_matrix_, 128*128*sizeof(float64_t)) ;
00051 
00052     read_profiles_and_sequences();
00053 
00054     //string_features = new CStringFeatures<char>(sequences, nof_sequences, max_sequence_length, PROTEIN);
00055     string_features = new CStringFeatures<char>(sequences, nof_sequences, max_sequence_length, IUPAC_AMINO_ACID);
00056     init(string_features, string_features);
00057 }
00058 
00059 CSpectrumRBFKernel::CSpectrumRBFKernel(
00060     CStringFeatures<char>* l, CStringFeatures<char>* r, int32_t size, float64_t* AA_matrix_, int32_t degree_, float64_t width_)
00061 : CStringKernel<char>(size), alphabet(NULL), degree(degree_), width(width_), sequences(NULL), string_features(NULL), nof_sequences(0), max_sequence_length(0)
00062 {
00063     target_letter_0=-1 ;
00064 
00065     AA_matrix=new float64_t[128*128];
00066     memcpy(AA_matrix, AA_matrix_, 128*128*sizeof(float64_t)) ;
00067 
00068     init(l, r);
00069 }
00070 
00071 CSpectrumRBFKernel::~CSpectrumRBFKernel()
00072 {
00073     cleanup();
00074     delete[] AA_matrix ;
00075 }
00076 
00077 
00078 void CSpectrumRBFKernel::remove_lhs()
00079 {
00080 
00081     CKernel::remove_lhs();
00082 }
00083 
00084 void CSpectrumRBFKernel::read_profiles_and_sequences()
00085 {
00086 
00087         int32_t aa_to_index[128];//profile
00088         aa_to_index['A'] = 0;
00089         aa_to_index['R'] = 1;
00090         aa_to_index['N'] = 2;
00091         aa_to_index['D'] = 3;
00092         aa_to_index['C'] = 4;
00093         aa_to_index['Q'] = 5;
00094         aa_to_index['E'] = 6;
00095         aa_to_index['G'] = 7;
00096         aa_to_index['H'] = 8;
00097         aa_to_index['I'] = 9;
00098         aa_to_index['L'] = 10;
00099         aa_to_index['K'] = 11;
00100         aa_to_index['M'] = 12;
00101         aa_to_index['F'] = 13;
00102         aa_to_index['P'] = 14;
00103         aa_to_index['S'] = 15;
00104         aa_to_index['T'] = 16;
00105         aa_to_index['W'] = 17;
00106         aa_to_index['Y'] = 18;
00107         aa_to_index['V'] = 19;
00108     SG_DEBUG("initializing background\n");
00109     double background[20]; // profile
00110     background[0]=0.0799912015849807; //A
00111   background[1]=0.0484482507611578;//R
00112   background[2]=0.044293531582512;//N
00113   background[3]=0.0578891399707563;//D
00114   background[4]=0.0171846021407367;//C
00115   background[5]=0.0380578923048682;//Q
00116   background[6]=0.0638169929675978;//E
00117   background[7]=0.0760659374742852;//G
00118     background[8]=0.0223465499452473;//H
00119     background[9]=0.0550905793661343;//I
00120     background[10]=0.0866897071203864;//L
00121     background[11]=0.060458245507428;//K
00122     background[12]=0.0215379186368154;//M
00123     background[13]=0.0396348024787477;//F
00124     background[14]=0.0465746314476874;//P
00125     background[15]=0.0630028230885602;//S
00126     background[16]=0.0580394726014824;//T
00127     background[17]=0.0144991866213453;//W
00128     background[18]=0.03635438623143;//Y
00129     background[19]=0.0700241481678408;//V
00130 
00131 
00132     std::vector<std::string> seqs;
00133     //int32_t nof_sequences = 7329;
00134 
00135     double C = 0.8;
00136     const char *filename="/fml/ag-raetsch/home/toussaint/scp/aawd_compbio_workshop/code_nora/data/profile/profiles";
00137     std::ifstream fin(filename);    
00138     
00139     SG_DEBUG("Reading profiles from %s\n", filename);
00140     std::string line;
00141   while (!fin.eof())
00142     { 
00143         std::getline(fin, line);
00144 
00145         if (line[0] == '>') // new sequence
00146         { 
00147             int idx = line.find_first_of(' ');
00148             sequence_labels.push_back(line.substr(1,idx-1));
00149             std::getline(fin, line);
00150             std::string orig_sequence = line;
00151             std::string sequence="";
00152 
00153             int len_line = line.length();
00154 
00155             // skip 3 lines
00156 
00157             std::getline(fin, line);
00158             std::getline(fin, line);
00159             std::getline(fin, line);
00160 
00161             profiles.push_back(std::vector<double>());
00162 
00163             std::vector<double>& curr_profile = profiles.back();
00164             for (int i=0; i < len_line; ++i)
00165             { 
00166                     std::getline(fin, line);
00167                     int a = line.find_first_not_of(' '); // index position
00168                     int b = line.find_first_of(' ', a); // index position
00169                     a = line.find_first_not_of(' ', b); // aa position
00170                     b = line.find_first_of(' ', a); // aa position
00171                     std::string aa=line.substr(a,b-a);
00172                     if (0) //(aa =="B" || aa == "X" || aa == "Z")
00173                     {
00174                         int pos = seqs.size()+1;
00175                         SG_DEBUG("Skipping aa in sequence %d\n", pos);
00176                     continue;
00177             }
00178                     else
00179                     {
00180                         sequence += aa;
00181 
00182                         a = line.find_first_not_of(' ', b); // beginning of block to ignore
00183                         b = line.find_first_of(' ', a); // aa position
00184 
00185                         for (int j=0; j < 19; ++j)
00186                         { 
00187                             a = line.find_first_not_of(' ', b);
00188                             b = line.find_first_of(' ', a);
00189                         }
00190 
00191                         int all_zeros = 1;
00192                         // interesting block
00193                         for (int j=0; j < 20; ++j)
00194                         { 
00195                             a = line.find_first_not_of(' ', b);
00196                             b = line.find_first_of(' ', a);
00197                             double p = atof(line.substr(a, b-a).c_str());
00198                             if (p > 0)
00199                             {
00200                                 all_zeros = 0;
00201                             }
00202                             double value = -1* std::log(C*(p/100)+(1-C)*background[j]); // taken from Leslie's example, C actually corresponds to 1/(1+C)
00203                             curr_profile.push_back(value);
00204                             //SG_DEBUG("seq %d aa %d value %f p %f  bg %f\n", i, j, value,p, background[j]);
00205                         }
00206 
00207                         if (all_zeros)
00208                         {
00209                             SG_DEBUG(">>>>>>>>>>>>>>> all zeros");
00210                             if (aa !="B" && aa != "X" && aa != "Z")
00211                             {
00212                                 //profile[i][temp_profile_index]=-log(C+(1-C)*background[re_candidate[temp_profile_index]]);
00213                                 int32_t aa_index = aa_to_index[(int)aa.c_str()[0]];
00214                                 double value = -1* std::log(C+(1-C)*background[aa_index]); // taken from Leslie's example, C actually corresponds to 1/(1+C)
00215                                 SG_DEBUG("before %f\n", profiles.back()[(i-1) * 20 + aa_index]);
00216                                 curr_profile[(i*20) + aa_index] = value;
00217                                 SG_DEBUG(">>> aa %c \t %d \t %f\n", aa.c_str()[0], aa_index, value);
00218                                 
00219                                 /*
00220                                 for (int z=0; z <20; ++z)
00221                                 {
00222                                     SG_DEBUG(" %d \t %f\t", z, curr_profile[z]);
00223                                 }
00224                                 SG_DEBUG("\n");
00225                                 */
00226                             }
00227                         }
00228                     }
00229             }
00230 
00231             if (curr_profile.size() != 20 * sequence.length())
00232         { 
00233                 SG_ERROR("Something's wrong with the profile.\n");
00234                 break;
00235             }
00236 
00237             seqs.push_back(sequence);
00238 
00239 
00240             /*
00241             // 6 irrelevant lines
00242             for (int i=0; i < 6; ++i)
00243             { 
00244                 std::getline(fin, line);
00245             }
00246             //
00247             */
00248         }
00249     }
00250     
00251     fin.close();
00252 
00253     nof_sequences = seqs.size();
00254     sequences = new T_STRING<char>[nof_sequences];
00255 
00256     int max_len = 0;
00257     for (int i=0; i < nof_sequences; ++i)
00258     {
00259         int len = seqs[i].length();
00260         sequences[i].string = new char[len+1];
00261         sequences[i].length = len;
00262         strcpy(sequences[i].string, seqs[i].c_str());
00263 
00264         if (len > max_len) max_len = len;
00265     }
00266 
00267     max_sequence_length = max_len;
00268     //string_features = new CStringFeatures<char>(sequences, nof_sequences, max_sequence_length, PROTEIN);
00269 
00270 }
00271 
00272 bool CSpectrumRBFKernel::init(CFeatures* l, CFeatures* r)
00273 {
00274     // >> profile
00275 /*
00276     read_profiles_and_sequences();
00277     l = string_features;
00278     r = string_features;
00279     */
00280     // << profile
00281 
00282     int32_t lhs_changed=(lhs!=l);
00283     int32_t rhs_changed=(rhs!=r);
00284 
00285     CStringKernel<char>::init(l,r);
00286 
00287     SG_DEBUG("lhs_changed: %i\n", lhs_changed);
00288     SG_DEBUG("rhs_changed: %i\n", rhs_changed);
00289 
00290     CStringFeatures<char>* sf_l=(CStringFeatures<char>*) l;
00291     CStringFeatures<char>* sf_r=(CStringFeatures<char>*) r;
00292 
00293     SG_UNREF(alphabet);
00294     alphabet=sf_l->get_alphabet();
00295     CAlphabet* ralphabet=sf_r->get_alphabet();
00296 
00297     if (!((alphabet->get_alphabet()==DNA) || (alphabet->get_alphabet()==RNA)))
00298         properties &= ((uint64_t) (-1)) ^ (KP_LINADD | KP_BATCHEVALUATION);
00299 
00300     ASSERT(ralphabet->get_alphabet()==alphabet->get_alphabet());
00301     SG_UNREF(ralphabet);
00302 
00303 
00304     return init_normalizer();
00305 }
00306 
00307 void CSpectrumRBFKernel::cleanup()
00308 {
00309 
00310     SG_UNREF(alphabet);
00311     alphabet=NULL;
00312 
00313     CKernel::cleanup();
00314 }
00315 
00316 inline bool isaa(char c)
00317 {
00318   if (c<65 || c>89 || c=='B' || c=='J' || c=='O' || c=='U' || c=='X' || c=='Z')
00319     return false ;
00320   return true ;
00321 }
00322 
00323 float64_t CSpectrumRBFKernel::AA_helper(const char* path, const int seq_degree, const char* joint_seq, unsigned int index)
00324 {
00325     //const char* AA = "ARNDCQEGHILKMFPSTWYV";
00326   float64_t diff=0.0 ;
00327   
00328   for (int i=0; i<seq_degree; i++)
00329     {
00330       if (!isaa(path[i])||!isaa(joint_seq[index+i]))
00331     diff+=1.4 ;
00332       else
00333     {
00334       diff += AA_matrix[ (path[i]-1)*128 + path[i] - 1] ;
00335       diff -= 2*AA_matrix[ (path[i]-1)*128 + joint_seq[index+i] - 1] ;
00336       diff += AA_matrix[ (joint_seq[index+i]-1)*128 + joint_seq[index+i] - 1] ;
00337       if (CMath::is_nan(diff))
00338         fprintf(stderr, "nan occurred: '%c' '%c'\n", path[i], joint_seq[index+i]) ;
00339     }
00340     }
00341   
00342   return exp( - diff/width) ;
00343 }
00344 
00345 float64_t CSpectrumRBFKernel::compute(int32_t idx_a, int32_t idx_b)
00346 {
00347     int32_t alen, blen;
00348     bool afree, bfree;
00349 
00350     char* avec = ((CStringFeatures<char>*) lhs)->get_feature_vector(idx_a, alen, afree);
00351     char* bvec = ((CStringFeatures<char>*) rhs)->get_feature_vector(idx_b, blen, bfree);
00352 
00353     float64_t result=0;
00354     for (int32_t i=0; i<alen; i++)
00355       {
00356         for (int32_t j=0; j<blen; j++)
00357           {
00358         if ((i+degree<=alen) && (j+degree<=blen))
00359           result += AA_helper(&(avec[i]), degree, bvec, j) ;
00360           }
00361       }
00362 
00363     ((CStringFeatures<char>*) lhs)->free_feature_vector(avec, idx_a, afree);
00364     ((CStringFeatures<char>*) rhs)->free_feature_vector(bvec, idx_b, bfree);
00365     return result;
00366 }
00367 
00368 bool CSpectrumRBFKernel::set_AA_matrix(
00369     float64_t* AA_matrix_)
00370 {
00371 
00372     if (AA_matrix_)
00373     {
00374         SG_DEBUG("Setting AA_matrix\n") ;
00375         memcpy(AA_matrix, AA_matrix_, 128*128*sizeof(float64_t)) ;
00376         return true ;
00377     }
00378 
00379     return false;
00380 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation