SNPFeatures.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2009 Soeren Sonnenburg
00008  * Copyright (C) 2009 Fraunhofer Institute FIRST and Max-Planck-Society
00009  */
00010 
00011 #include "features/SNPFeatures.h"
00012 #include "lib/io.h"
00013 #include "features/Alphabet.h"
00014 
00015 using namespace shogun;
00016 
00017 CSNPFeatures::CSNPFeatures(CStringFeatures<uint8_t>* str) : CDotFeatures(),
00018     m_str_min(NULL), m_str_maj(NULL)
00019 {
00020     ASSERT(str);
00021     ASSERT(str->have_same_length());
00022     SG_REF(str);
00023 
00024     strings=str;
00025     string_length=str->get_max_vector_length();
00026     ASSERT((string_length & 1) == 0); // length divisible by 2
00027     w_dim=3*string_length/2;
00028     num_strings=str->get_num_vectors();
00029     CAlphabet* alpha=str->get_alphabet();
00030     ASSERT(alpha->get_alphabet()==SNP);
00031     SG_UNREF(alpha);
00032 
00033     obtain_base_strings();
00034     set_normalization_const();
00035 
00036 }
00037 
00038 CSNPFeatures::CSNPFeatures(const CSNPFeatures& orig)
00039     : CDotFeatures(orig), strings(orig.strings),
00040     normalization_const(orig.normalization_const),
00041     m_str_min(NULL), m_str_maj(NULL)
00042 {
00043     SG_REF(strings);
00044     string_length=strings->get_max_vector_length();
00045     ASSERT((string_length & 1) == 0); // length divisible by 2
00046     w_dim=3*string_length;
00047     num_strings=strings->get_num_vectors();
00048     CAlphabet* alpha=strings->get_alphabet();
00049     SG_UNREF(alpha);
00050 
00051     obtain_base_strings();
00052 }
00053 
00054 CSNPFeatures::~CSNPFeatures()
00055 {
00056     SG_UNREF(strings);
00057 }
00058 
00059 float64_t CSNPFeatures::dot(int32_t idx_a, int32_t idx_b)
00060 {
00061     int32_t alen, blen;
00062     bool free_avec, free_bvec;
00063 
00064     uint8_t* avec = ((CStringFeatures<uint8_t>*) strings)->get_feature_vector(idx_a, alen, free_avec);
00065     uint8_t* bvec = ((CStringFeatures<uint8_t>*) strings)->get_feature_vector(idx_b, blen, free_bvec);
00066 
00067     ASSERT(alen==blen);
00068     if (alen!=string_length)
00069         SG_ERROR("alen (%d) !=string_length (%d)\n", alen, string_length);
00070     ASSERT(m_str_min);
00071     ASSERT(m_str_maj);
00072 
00073     float64_t total=0;
00074 
00075     for (int32_t i = 0; i<alen-1; i+=2)
00076     {
00077         int32_t sumaa=0;
00078         int32_t sumbb=0;
00079         int32_t sumab=0;
00080 
00081         uint8_t a1=avec[i];
00082         uint8_t a2=avec[i+1];
00083         uint8_t b1=bvec[i];
00084         uint8_t b2=bvec[i+1];
00085 
00086         if ((a1!=a2 || a1=='0' || a1=='0') && (b1!=b2 || b1=='0' || b2=='0'))
00087             sumab++;
00088         else if (a1==a2 && b1==b2)
00089         {
00090             if (a1!=b1)
00091                 continue;
00092 
00093             if (a1==m_str_min[i])
00094                 sumaa++;
00095             else if (a1==m_str_maj[i])
00096                 sumbb++;
00097             else
00098             {
00099                 SG_ERROR("The impossible happened i=%d a1=%c "
00100                         "a2=%c b1=%c b2=%c min=%c maj=%c\n", i, a1,a2, b1,b2, m_str_min[i], m_str_maj[i]);
00101             }
00102 
00103         }
00104         total+=sumaa+sumbb+sumab;
00105     }
00106 
00107     ((CStringFeatures<uint8_t>*) strings)->free_feature_vector(avec, idx_a, free_avec);
00108     ((CStringFeatures<uint8_t>*) strings)->free_feature_vector(bvec, idx_b, free_bvec);
00109     return total;
00110 }
00111 
00112 float64_t CSNPFeatures::dense_dot(int32_t vec_idx1, const float64_t* vec2, int32_t vec2_len)
00113 {
00114     if (vec2_len != w_dim)
00115         SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim);
00116 
00117     float64_t sum=0;
00118     int32_t len;
00119     bool free_vec1;
00120     uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1);
00121     int32_t offs=0;
00122 
00123     for (int32_t i=0; i<len; i+=2)
00124     {
00125         int32_t dim=0;
00126 
00127         char a1=vec[i];
00128         char a2=vec[i+1];
00129 
00130         if (a1==a2 && a1!='0' && a2!='0')
00131         {
00132             if (a1==m_str_min[i])
00133                 dim=1;
00134             else if (a1==m_str_maj[i])
00135                 dim=2;
00136             else
00137             {
00138                 SG_ERROR("The impossible happened i=%d a1=%c a2=%c min=%c maj=%c\n",
00139                         i, a1,a2, m_str_min[i], m_str_maj[i]);
00140             }
00141         }
00142 
00143         sum+=vec2[offs+dim];
00144         offs+=3;
00145     }
00146     strings->free_feature_vector(vec, vec_idx1, free_vec1);
00147 
00148     return sum/normalization_const;
00149 }
00150 
00151 void CSNPFeatures::add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t* vec2, int32_t vec2_len, bool abs_val)
00152 {
00153     if (vec2_len != w_dim)
00154         SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim);
00155 
00156     int32_t len;
00157     bool free_vec1;
00158     uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1);
00159     int32_t offs=0;
00160 
00161     if (abs_val)
00162         alpha=CMath::abs(alpha);
00163 
00164     for (int32_t i=0; i<len; i+=2)
00165     {
00166         int32_t dim=0;
00167 
00168         char a1=vec[i];
00169         char a2=vec[i+1];
00170 
00171         if (a1==a2 && a1!='0' && a2!='0')
00172         {
00173             if (a1==m_str_min[i])
00174                 dim=1;
00175             else if (a1==m_str_maj[i])
00176                 dim=2;
00177             else
00178             {
00179                 SG_ERROR("The impossible happened i=%d a1=%c a2=%c min=%c maj=%c\n",
00180                         i, a1,a2, m_str_min[i], m_str_maj[i]);
00181             }
00182         }
00183 
00184         vec2[offs+dim]+=alpha;
00185         offs+=3;
00186     }
00187     strings->free_feature_vector(vec, vec_idx1, free_vec1);
00188 }
00189 
00190 void CSNPFeatures::obtain_base_strings()
00191 {
00192     for (int32_t i=0; i<num_strings; i++)
00193     {
00194         int32_t len;
00195         bool free_vec;
00196         uint8_t* vec = ((CStringFeatures<uint8_t>*) strings)->get_feature_vector(i, len, free_vec);
00197         ASSERT(string_length==len);
00198 
00199         if (i==0)
00200         {
00201             size_t tlen=(len+1)*sizeof(uint8_t);
00202             m_str_min=(uint8_t*) malloc(tlen);
00203             m_str_maj=(uint8_t*) malloc(tlen);
00204             memset(m_str_min, 0, tlen);
00205             memset(m_str_maj, 0, tlen);
00206         }
00207 
00208         for (int32_t j=0; j<len; j++)
00209         {
00210             // skip sequencing errors
00211             if (vec[j]=='0')
00212                 continue;
00213 
00214             if (m_str_min[j]==0)
00215                 m_str_min[j]=vec[j];
00216             else if (m_str_maj[j]==0 && vec[j]!=m_str_min[j])
00217                 m_str_maj[j]=vec[j];
00218         }
00219 
00220         ((CStringFeatures<uint8_t>*) strings)->free_feature_vector(vec, i, free_vec);
00221     }
00222 
00223     for (int32_t j=0; j<string_length; j++)
00224     {
00225         // if only one symbol occurs use 0
00226         if (m_str_min[j]==0)
00227             m_str_min[j]='0';
00228         if (m_str_maj[j]==0)
00229             m_str_maj[j]='0';
00230 
00231         if (m_str_min[j]>m_str_maj[j])
00232             CMath::swap(m_str_min[j], m_str_maj[j]);
00233     }
00234 }
00235 
00236 void CSNPFeatures::set_normalization_const(float64_t n)
00237 {
00238     if (n==0)
00239     {
00240         normalization_const=string_length;
00241         normalization_const=CMath::sqrt(normalization_const);
00242     }
00243     else
00244         normalization_const=n;
00245 
00246     SG_DEBUG("normalization_const:%f\n", normalization_const);
00247 }
00248 
00249 void* CSNPFeatures::get_feature_iterator(int32_t vector_index)
00250 {
00251     return NULL;
00252 }
00253 
00254 bool CSNPFeatures::get_next_feature(int32_t& index, float64_t& value, void* iterator)
00255 {
00256     return false;
00257 }
00258 
00259 void CSNPFeatures::free_feature_iterator(void* iterator)
00260 {
00261 }
00262 
00263 CFeatures* CSNPFeatures::duplicate() const
00264 {
00265     return new CSNPFeatures(*this);
00266 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation