00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011 #include "features/SNPFeatures.h"
00012 #include "lib/io.h"
00013 #include "features/Alphabet.h"
00014
00015 using namespace shogun;
00016
00017 CSNPFeatures::CSNPFeatures(CStringFeatures<uint8_t>* str) : CDotFeatures(),
00018 m_str_min(NULL), m_str_maj(NULL)
00019 {
00020 ASSERT(str);
00021 ASSERT(str->have_same_length());
00022 SG_REF(str);
00023
00024 strings=str;
00025 string_length=str->get_max_vector_length();
00026 ASSERT((string_length & 1) == 0);
00027 w_dim=3*string_length/2;
00028 num_strings=str->get_num_vectors();
00029 CAlphabet* alpha=str->get_alphabet();
00030 ASSERT(alpha->get_alphabet()==SNP);
00031 SG_UNREF(alpha);
00032
00033 obtain_base_strings();
00034 set_normalization_const();
00035
00036 }
00037
00038 CSNPFeatures::CSNPFeatures(const CSNPFeatures& orig)
00039 : CDotFeatures(orig), strings(orig.strings),
00040 normalization_const(orig.normalization_const),
00041 m_str_min(NULL), m_str_maj(NULL)
00042 {
00043 SG_REF(strings);
00044 string_length=strings->get_max_vector_length();
00045 ASSERT((string_length & 1) == 0);
00046 w_dim=3*string_length;
00047 num_strings=strings->get_num_vectors();
00048 CAlphabet* alpha=strings->get_alphabet();
00049 SG_UNREF(alpha);
00050
00051 obtain_base_strings();
00052 }
00053
00054 CSNPFeatures::~CSNPFeatures()
00055 {
00056 SG_UNREF(strings);
00057 }
00058
00059 float64_t CSNPFeatures::dot(int32_t idx_a, int32_t idx_b)
00060 {
00061 int32_t alen, blen;
00062 bool free_avec, free_bvec;
00063
00064 uint8_t* avec = ((CStringFeatures<uint8_t>*) strings)->get_feature_vector(idx_a, alen, free_avec);
00065 uint8_t* bvec = ((CStringFeatures<uint8_t>*) strings)->get_feature_vector(idx_b, blen, free_bvec);
00066
00067 ASSERT(alen==blen);
00068 if (alen!=string_length)
00069 SG_ERROR("alen (%d) !=string_length (%d)\n", alen, string_length);
00070 ASSERT(m_str_min);
00071 ASSERT(m_str_maj);
00072
00073 float64_t total=0;
00074
00075 for (int32_t i = 0; i<alen-1; i+=2)
00076 {
00077 int32_t sumaa=0;
00078 int32_t sumbb=0;
00079 int32_t sumab=0;
00080
00081 uint8_t a1=avec[i];
00082 uint8_t a2=avec[i+1];
00083 uint8_t b1=bvec[i];
00084 uint8_t b2=bvec[i+1];
00085
00086 if ((a1!=a2 || a1=='0' || a1=='0') && (b1!=b2 || b1=='0' || b2=='0'))
00087 sumab++;
00088 else if (a1==a2 && b1==b2)
00089 {
00090 if (a1!=b1)
00091 continue;
00092
00093 if (a1==m_str_min[i])
00094 sumaa++;
00095 else if (a1==m_str_maj[i])
00096 sumbb++;
00097 else
00098 {
00099 SG_ERROR("The impossible happened i=%d a1=%c "
00100 "a2=%c b1=%c b2=%c min=%c maj=%c\n", i, a1,a2, b1,b2, m_str_min[i], m_str_maj[i]);
00101 }
00102
00103 }
00104 total+=sumaa+sumbb+sumab;
00105 }
00106
00107 ((CStringFeatures<uint8_t>*) strings)->free_feature_vector(avec, idx_a, free_avec);
00108 ((CStringFeatures<uint8_t>*) strings)->free_feature_vector(bvec, idx_b, free_bvec);
00109 return total;
00110 }
00111
00112 float64_t CSNPFeatures::dense_dot(int32_t vec_idx1, const float64_t* vec2, int32_t vec2_len)
00113 {
00114 if (vec2_len != w_dim)
00115 SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim);
00116
00117 float64_t sum=0;
00118 int32_t len;
00119 bool free_vec1;
00120 uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1);
00121 int32_t offs=0;
00122
00123 for (int32_t i=0; i<len; i+=2)
00124 {
00125 int32_t dim=0;
00126
00127 char a1=vec[i];
00128 char a2=vec[i+1];
00129
00130 if (a1==a2 && a1!='0' && a2!='0')
00131 {
00132 if (a1==m_str_min[i])
00133 dim=1;
00134 else if (a1==m_str_maj[i])
00135 dim=2;
00136 else
00137 {
00138 SG_ERROR("The impossible happened i=%d a1=%c a2=%c min=%c maj=%c\n",
00139 i, a1,a2, m_str_min[i], m_str_maj[i]);
00140 }
00141 }
00142
00143 sum+=vec2[offs+dim];
00144 offs+=3;
00145 }
00146 strings->free_feature_vector(vec, vec_idx1, free_vec1);
00147
00148 return sum/normalization_const;
00149 }
00150
00151 void CSNPFeatures::add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t* vec2, int32_t vec2_len, bool abs_val)
00152 {
00153 if (vec2_len != w_dim)
00154 SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim);
00155
00156 int32_t len;
00157 bool free_vec1;
00158 uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1);
00159 int32_t offs=0;
00160
00161 if (abs_val)
00162 alpha=CMath::abs(alpha);
00163
00164 for (int32_t i=0; i<len; i+=2)
00165 {
00166 int32_t dim=0;
00167
00168 char a1=vec[i];
00169 char a2=vec[i+1];
00170
00171 if (a1==a2 && a1!='0' && a2!='0')
00172 {
00173 if (a1==m_str_min[i])
00174 dim=1;
00175 else if (a1==m_str_maj[i])
00176 dim=2;
00177 else
00178 {
00179 SG_ERROR("The impossible happened i=%d a1=%c a2=%c min=%c maj=%c\n",
00180 i, a1,a2, m_str_min[i], m_str_maj[i]);
00181 }
00182 }
00183
00184 vec2[offs+dim]+=alpha;
00185 offs+=3;
00186 }
00187 strings->free_feature_vector(vec, vec_idx1, free_vec1);
00188 }
00189
00190 void CSNPFeatures::obtain_base_strings()
00191 {
00192 for (int32_t i=0; i<num_strings; i++)
00193 {
00194 int32_t len;
00195 bool free_vec;
00196 uint8_t* vec = ((CStringFeatures<uint8_t>*) strings)->get_feature_vector(i, len, free_vec);
00197 ASSERT(string_length==len);
00198
00199 if (i==0)
00200 {
00201 size_t tlen=(len+1)*sizeof(uint8_t);
00202 m_str_min=(uint8_t*) malloc(tlen);
00203 m_str_maj=(uint8_t*) malloc(tlen);
00204 memset(m_str_min, 0, tlen);
00205 memset(m_str_maj, 0, tlen);
00206 }
00207
00208 for (int32_t j=0; j<len; j++)
00209 {
00210
00211 if (vec[j]=='0')
00212 continue;
00213
00214 if (m_str_min[j]==0)
00215 m_str_min[j]=vec[j];
00216 else if (m_str_maj[j]==0 && vec[j]!=m_str_min[j])
00217 m_str_maj[j]=vec[j];
00218 }
00219
00220 ((CStringFeatures<uint8_t>*) strings)->free_feature_vector(vec, i, free_vec);
00221 }
00222
00223 for (int32_t j=0; j<string_length; j++)
00224 {
00225
00226 if (m_str_min[j]==0)
00227 m_str_min[j]='0';
00228 if (m_str_maj[j]==0)
00229 m_str_maj[j]='0';
00230
00231 if (m_str_min[j]>m_str_maj[j])
00232 CMath::swap(m_str_min[j], m_str_maj[j]);
00233 }
00234 }
00235
00236 void CSNPFeatures::set_normalization_const(float64_t n)
00237 {
00238 if (n==0)
00239 {
00240 normalization_const=string_length;
00241 normalization_const=CMath::sqrt(normalization_const);
00242 }
00243 else
00244 normalization_const=n;
00245
00246 SG_DEBUG("normalization_const:%f\n", normalization_const);
00247 }
00248
00249 void* CSNPFeatures::get_feature_iterator(int32_t vector_index)
00250 {
00251 return NULL;
00252 }
00253
00254 bool CSNPFeatures::get_next_feature(int32_t& index, float64_t& value, void* iterator)
00255 {
00256 return false;
00257 }
00258
00259 void CSNPFeatures::free_feature_iterator(void* iterator)
00260 {
00261 }
00262
00263 CFeatures* CSNPFeatures::duplicate() const
00264 {
00265 return new CSNPFeatures(*this);
00266 }