StringFeatures.h

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2009 Soeren Sonnenburg
00008  * Written (W) 1999-2008 Gunnar Raetsch
00009  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
00010  */
00011 
00012 #ifndef _CSTRINGFEATURES__H__
00013 #define _CSTRINGFEATURES__H__
00014 
00015 #include "preproc/PreProc.h"
00016 #include "preproc/StringPreProc.h"
00017 #include "features/Features.h"
00018 #include "features/SimpleFeatures.h"
00019 #include "features/Alphabet.h"
00020 #include "lib/common.h"
00021 #include "lib/io.h"
00022 #include "lib/DynamicArray.h"
00023 #include "lib/File.h"
00024 #include "lib/MemoryMappedFile.h"
00025 #include "lib/Mathematics.h"
00026 
00027 #include <sys/types.h>
00028 #include <sys/stat.h>
00029 #include <dirent.h>
00030 #include <stdio.h>
00031 #include <stdlib.h>
00032 #include <unistd.h>
00033 
00034 class CFile;
00035 
00036 template <class ST> class CStringPreProc;
00037 
#ifndef DOXYGEN_SHOULD_SKIP_THIS
/** Generic string: a raw element buffer together with its element count.
 * The buffer is not zero-terminated.
 */
template <class T> struct T_STRING
{
    /** pointer to the first element of the string */
    T* string;
    /** number of elements stored in string */
    int32_t length;
};
#endif // DOXYGEN_SHOULD_SKIP_THIS

/** Create a zero-terminated char copy of a T_STRING.
 *
 * Every element is converted to char individually, so strings whose element
 * type is wider than one byte are narrowed element-wise instead of having
 * their raw bytes copied. A NULL buffer or a non-positive length yields an
 * empty string.
 *
 * @param str string to copy
 * @return buffer of length+1 chars allocated with new[]; the caller has to
 *         release it with delete[]
 */
template <class T> char* get_zero_terminated_string_copy(T_STRING<T> str)
{
    const int32_t l=(str.string && str.length>0) ? str.length : 0;
    char* s=new char[l+1];

    for (int32_t i=0; i<l; i++)
        s[i]=(char) str.string[i];

    s[l]='\0';
    return s;
}
00057 
00071 template <class ST> class CStringFeatures : public CFeatures
00072 {
00073     public:
00078         CStringFeatures(EAlphabet alpha)
00079         : CFeatures(0), num_vectors(0), features(NULL),
00080             single_string(NULL),length_of_single_string(0),
00081             max_string_length(0), order(0), symbol_mask_table(NULL)
00082         {
00083             alphabet=new CAlphabet(alpha);
00084             SG_REF(alphabet);
00085             num_symbols=alphabet->get_num_symbols();
00086             original_num_symbols=num_symbols;
00087         }
00088 
00096         CStringFeatures(T_STRING<ST>* p_features, int32_t p_num_vectors,
00097                 int32_t p_max_string_length, EAlphabet alpha)
00098         : CFeatures(0), num_vectors(0), features(NULL),
00099             single_string(NULL),length_of_single_string(0),
00100             max_string_length(0), order(0), symbol_mask_table(NULL)
00101         {
00102             alphabet=new CAlphabet(alpha);
00103             SG_REF(alphabet);
00104             num_symbols=alphabet->get_num_symbols();
00105             original_num_symbols=num_symbols;
00106             set_features(p_features, p_num_vectors, p_max_string_length);
00107         }
00108 
00113         CStringFeatures(CAlphabet* alpha)
00114         : CFeatures(0), num_vectors(0), features(NULL),
00115             single_string(NULL),length_of_single_string(0),
00116             max_string_length(0), order(0), symbol_mask_table(NULL)
00117         {
00118             ASSERT(alpha);
00119             SG_REF(alpha);
00120             alphabet=alpha;
00121             num_symbols=alphabet->get_num_symbols();
00122             original_num_symbols=num_symbols;
00123         }
00124 
00126         CStringFeatures(const CStringFeatures & orig)
00127         : CFeatures(orig), num_vectors(orig.num_vectors),
00128             single_string(orig.single_string),
00129             length_of_single_string(orig.length_of_single_string),
00130             max_string_length(orig.max_string_length),
00131             num_symbols(orig.num_symbols),
00132             original_num_symbols(orig.original_num_symbols),
00133             order(orig.order)
00134         {
00135             ASSERT(orig.single_string == NULL); //not implemented
00136 
00137             alphabet=orig.alphabet;
00138             SG_REF(alphabet);
00139 
00140             if (orig.features)
00141             {
00142                 features=new T_STRING<ST>[orig.num_vectors];
00143 
00144                 for (int32_t i=0; i<num_vectors; i++)
00145                 {
00146                     features[i].string=new ST[orig.features[i].length];
00147                     ASSERT(features[i].string);
00148                     features[i].length=orig.features[i].length;
00149                     memcpy(features[i].string, orig.features[i].string, sizeof(ST)*orig.features[i].length); 
00150                 }
00151             }
00152 
00153             if (orig.symbol_mask_table)
00154             {
00155                 symbol_mask_table=new ST[256];
00156                 for (int32_t i=0; i<256; i++)
00157                     symbol_mask_table[i]=orig.symbol_mask_table[i];
00158             }
00159         }
00160 
00166         CStringFeatures(char* fname, EAlphabet alpha=DNA)
00167         : CFeatures(fname), num_vectors(0),
00168             features(NULL), single_string(NULL),
00169             length_of_single_string(0), max_string_length(0),
00170             order(0), symbol_mask_table(NULL)
00171         {
00172             alphabet=new CAlphabet(alpha);
00173             SG_REF(alphabet);
00174             num_symbols=alphabet->get_num_symbols();
00175             original_num_symbols=num_symbols;
00176             load(fname);
00177         }
00178 
00179         virtual ~CStringFeatures()
00180         {
00181             cleanup();
00182 
00183             SG_UNREF(alphabet);
00184         }
00185 
00187         void cleanup()
00188         {
00189             if (single_string)
00190             {
00191                 delete[] single_string;
00192                 single_string=NULL;
00193             }
00194             else
00195             {
00196                 for (int32_t i=0; i<num_vectors; i++)
00197                 {
00198                     delete[] features[i].string;
00199                     features[i].length=0;
00200                 }
00201             }
00202 
00203             num_vectors=0;
00204             delete[] features;
00205             delete[] symbol_mask_table;
00206             features=NULL;
00207             symbol_mask_table=NULL;
00208 
00209             /* start with a fresh alphabet, but instead of emptying the histogram
00210              * create a new object (to leave the alphabet object alone if it is used
00211              * by others) 
00212              */
00213             CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet());
00214             SG_UNREF(alphabet);
00215             alphabet=alpha;
00216             SG_REF(alphabet);
00217         }
00218 
00223         inline virtual EFeatureClass get_feature_class() { return C_STRING; }
00224 
00229         inline virtual EFeatureType get_feature_type() { return F_UNKNOWN; }
00230 
00235         inline CAlphabet* get_alphabet()
00236         {
00237             SG_REF(alphabet);
00238             return alphabet;
00239         }
00240 
00245         virtual CFeatures* duplicate() const
00246         {
00247             return new CStringFeatures<ST>(*this);
00248         }
00249 
00256         void get_feature_vector(ST** dst, int32_t* len, int32_t num)
00257         {
00258             ASSERT(features);
00259             if (num>=num_vectors)
00260             {
00261                 SG_ERROR("Index out of bounds (number of strings %d, you "
00262                         "requested %d)\n", num_vectors, num);
00263             }
00264 
00265             *len=features[num].length;
00266             *dst=(ST*) malloc(*len * sizeof(ST));
00267             memcpy(*dst, features[num].string, *len * sizeof(ST));
00268         }
00269 
00276         void set_feature_vector(ST* src, int32_t len, int32_t num)
00277         {
00278             ASSERT(features);
00279             if (num>=num_vectors)
00280             {
00281                 SG_ERROR("Index out of bounds (number of strings %d, you "
00282                         "requested %d)\n", num_vectors, num);
00283             }
00284 
00285             if (len<=0)
00286                 SG_ERROR("String has zero or negative length\n");
00287 
00288 
00289             delete[] features[num].string;
00290             features[num].length=len;
00291             features[num].string=new ST[len];
00292             memcpy(features[num].string, src, len*sizeof(ST));
00293 
00294             determine_maximum_string_length();
00295         }
00296 
00303         virtual ST* get_feature_vector(int32_t num, int32_t& len)
00304         {
00305             ASSERT(features);
00306             ASSERT(num<num_vectors);
00307 
00308             len=features[num].length;
00309             return features[num].string;
00310         }
00311 
00318         virtual ST inline get_feature(int32_t vec_num, int32_t feat_num)
00319         {
00320             ASSERT(features && vec_num<num_vectors);
00321             ASSERT(feat_num<features[vec_num].length);
00322 
00323             return features[vec_num].string[feat_num];
00324         }
00325 
00331         virtual inline int32_t get_vector_length(int32_t vec_num)
00332         {
00333             ASSERT(features && vec_num<num_vectors);
00334             return features[vec_num].length;
00335         }
00336 
00341         virtual inline int32_t get_max_vector_length()
00342         {
00343             return max_string_length;
00344         }
00345 
00350         virtual inline int32_t get_num_vectors() { return num_vectors; }
00351 
00358         inline floatmax_t get_num_symbols() { return num_symbols; }
00359 
00367         inline floatmax_t get_max_num_symbols() { return CMath::powl(2,sizeof(ST)*8); }
00368 
00369         // these functions are necessary to find out about a former conversion process
00370 
00375         inline floatmax_t get_original_num_symbols() { return original_num_symbols; }
00376 
00381         inline int32_t get_order() { return order; }
00382 
00390         inline ST get_masked_symbols(ST symbol, uint8_t mask)
00391         {
00392             ASSERT(symbol_mask_table);
00393             return symbol_mask_table[mask] & symbol;
00394         }
00395 
00402         inline ST shift_offset(ST offset, int32_t amount)
00403         {
00404             ASSERT(alphabet);
00405             return (offset << (amount*alphabet->get_num_bits()));
00406         }
00407 
00414         inline ST shift_symbol(ST symbol, int32_t amount)
00415         {
00416             ASSERT(alphabet);
00417             return (symbol >> (amount*alphabet->get_num_bits()));
00418         }
00419 
00425         virtual bool load(char* fname)
00426         {
00427             SG_INFO( "loading...\n");
00428             int64_t length=0;
00429             max_string_length=0;
00430 
00431             CFile f(fname, 'r', F_CHAR);
00432             char* feature_matrix=f.load_char_data(NULL, length);
00433 
00434             num_vectors=0;
00435 
00436             if (f.is_ok())
00437             {
00438                 for (int64_t i=0; i<length; i++)
00439                 {
00440                     if (feature_matrix[i]=='\n')
00441                         num_vectors++;
00442                 }
00443 
00444                 SG_INFO( "file contains %ld vectors\n", num_vectors);
00445                 features= new T_STRING<ST>[num_vectors];
00446 
00447                 int64_t index=0;
00448                 for (int32_t lines=0; lines<num_vectors; lines++)
00449                 {
00450                     char* p=&feature_matrix[index];
00451                     int32_t columns=0;
00452 
00453                     for (columns=0; index+columns<length && p[columns]!='\n'; columns++);
00454 
00455                     if (index+columns>=length && p[columns]!='\n') {
00456                         SG_ERROR( "error in \"%s\":%d\n", fname, lines);
00457                     }
00458 
00459                     features[lines].length=columns;
00460                     features[lines].string=new ST[columns];
00461 
00462                     max_string_length=CMath::max(max_string_length,columns);
00463 
00464                     for (int32_t i=0; i<columns; i++)
00465                         features[lines].string[i]= ((ST) p[i]);
00466 
00467                     index+= features[lines].length+1;
00468                 }
00469 
00470                 num_symbols=4; //FIXME
00471                 return true;
00472             }
00473             else
00474                 SG_ERROR( "reading file failed\n");
00475 
00476             return false;
00477         }
00478 
00485         bool load_dna_file(char* fname, bool remap_to_bin=true)
00486         {
00487             bool result=false;
00488 
00489             size_t blocksize=1024*1024;
00490             size_t required_blocksize=0;
00491             uint8_t* dummy=new uint8_t[blocksize];
00492             uint8_t* overflow=NULL;
00493             int32_t overflow_len=0;
00494 
00495             num_symbols=4;
00496             cleanup();
00497 
00498             CAlphabet* alpha=new CAlphabet(DNA);
00499             CAlphabet* alpha_bin=new CAlphabet(RAWDNA);
00500 
00501             FILE* f=fopen(fname, "ro");
00502 
00503             if (f)
00504             {
00505                 num_vectors=0;
00506                 max_string_length=0;
00507 
00508                 SG_INFO("counting line numbers in file %s\n", fname);
00509                 size_t block_offs=0;
00510                 size_t old_block_offs=0;
00511                 fseek(f, 0, SEEK_END);
00512                 size_t fsize=ftell(f);
00513                 rewind(f);
00514 
00515                 if (blocksize>fsize)
00516                     blocksize=fsize;
00517 
00518                 SG_DEBUG("block_size=%ld file_size=%ld\n", blocksize, fsize);
00519 
00520                 size_t sz=blocksize;
00521                 while (sz == blocksize)
00522                 {
00523                     sz=fread(dummy, sizeof(uint8_t), blocksize, f);
00524                     bool contains_cr=false;
00525                     for (size_t i=0; i<sz; i++)
00526                     {
00527                         block_offs++;
00528                         if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00529                         {
00530                             num_vectors++;
00531                             contains_cr=true;
00532                             required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs);
00533                             old_block_offs=block_offs;
00534                         }
00535                     }
00536                     SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");
00537                 }
00538 
00539                 SG_INFO("found %d strings\n", num_vectors);
00540                 delete[] dummy;
00541                 blocksize=required_blocksize;
00542                 dummy = new uint8_t[blocksize];
00543                 overflow = new uint8_t[blocksize];
00544                 features=new T_STRING<ST>[num_vectors];
00545 
00546                 rewind(f);
00547                 sz=blocksize;
00548                 int32_t lines=0;
00549                 while (sz == blocksize)
00550                 {
00551                     sz=fread(dummy, sizeof(uint8_t), blocksize, f);
00552 
00553                     size_t old_sz=0;
00554                     for (size_t i=0; i<sz; i++)
00555                     {
00556                         if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00557                         {
00558                             int32_t len=i-old_sz;
00559                             //SG_PRINT("i:%d len:%d old_sz:%d\n", i, len, old_sz);
00560                             max_string_length=CMath::max(max_string_length, len+overflow_len);
00561 
00562                             features[lines].length=len;
00563                             features[lines].string=new ST[len];
00564 
00565                             if (remap_to_bin)
00566                             {
00567                                 for (int32_t j=0; j<overflow_len; j++)
00568                                     features[lines].string[j]=alpha->remap_to_bin(overflow[j]);
00569                                 for (int32_t j=0; j<len; j++)
00570                                     features[lines].string[j+overflow_len]=alpha->remap_to_bin(dummy[old_sz+j]);
00571                                 alpha_bin->add_string_to_histogram(features[lines].string, features[lines].length);
00572                             }
00573                             else
00574                             {
00575                                 for (int32_t j=0; j<overflow_len; j++)
00576                                     features[lines].string[j]=overflow[j];
00577                                 for (int32_t j=0; j<len; j++)
00578                                     features[lines].string[j+overflow_len]=dummy[old_sz+j];
00579                                 alpha->add_string_to_histogram(features[lines].string, features[lines].length);
00580                             }
00581 
00582                             // clear overflow
00583                             overflow_len=0;
00584 
00585                             //CMath::display_vector(features[lines].string, len);
00586                             old_sz=i+1;
00587                             lines++;
00588                             SG_PROGRESS(lines, 0, num_vectors, 1, "LOADING:\t");
00589                         }
00590                     }
00591                     for (size_t i=old_sz; i<sz; i++)
00592                         overflow[i-old_sz]=dummy[i];
00593 
00594                     overflow_len=sz-old_sz;
00595                 }
00596                 result=true;
00597                 SG_INFO("file successfully read\n");
00598                 SG_INFO("max_string_length=%d\n", max_string_length);
00599                 SG_INFO("num_strings=%d\n", num_vectors);
00600             }
00601 
00602             fclose(f);
00603             delete[] dummy;
00604 
00605             SG_UNREF(alphabet);
00606 
00607             if (remap_to_bin)
00608                 alphabet = alpha_bin;
00609             else
00610                 alphabet = alpha;
00611             SG_REF(alphabet);
00612 
00613             return result;
00614         }
00615 
00622         bool load_fasta_file(const char* fname, bool ignore_invalid=false)
00623         {
00624             int32_t i=0;
00625             uint64_t len=0;
00626             uint64_t offs=0;
00627             int32_t num=0;
00628             int32_t max_len=0;
00629 
00630             CMemoryMappedFile<char> f(fname);
00631 
00632             while (true)
00633             {
00634                 char* s=f.get_line(len, offs);
00635                 if (!s)
00636                     break;
00637 
00638                 if (len>0 && s[0]=='>')
00639                     num++;
00640             }
00641 
00642             if (num==0)
00643                 SG_ERROR("No fasta hunks (lines starting with '>') found\n");
00644 
00645             cleanup();
00646             SG_UNREF(alphabet);
00647             alphabet=new CAlphabet(DNA);
00648 
00649             T_STRING<ST>* strings=new T_STRING<ST>[num];
00650             offs=0;
00651 
00652             for (i=0;i<num; i++)
00653             {
00654                 uint64_t id_len=0;
00655                 char* id=f.get_line(id_len, offs);
00656 
00657                 char* fasta=f.get_line(len, offs);
00658                 char* s=fasta;
00659                 int32_t fasta_len=0;
00660                 int32_t spanned_lines=0;
00661 
00662                 while (true)
00663                 {
00664                     if (!s || len==0)
00665                         SG_ERROR("Error reading fasta entry in line %d len=%ld", 4*i+1, len);
00666 
00667                     if (s[0]=='>' || offs==f.get_size())
00668                     {
00669                         offs-=len+1; // seek to beginning
00670                         if (offs==f.get_size())
00671                         {
00672                             SG_DEBUG("at EOF\n");
00673                             fasta_len+=len;
00674                         }
00675 
00676                         len = fasta_len-spanned_lines;
00677                         strings[i].string=new ST[len];
00678                         strings[i].length=len;
00679 
00680                         ST* str=strings[i].string;
00681                         int32_t idx=0;
00682                         SG_DEBUG("'%.*s', len=%d, spanned_lines=%d\n", (int32_t) id_len, id, (int32_t) len, (int32_t) spanned_lines);
00683 
00684                         for (int32_t j=0; j<fasta_len; j++)
00685                         {
00686                             if (fasta[j]=='\n')
00687                                 continue;
00688 
00689                             ST c = (ST) fasta[j];
00690 
00691                             if (ignore_invalid  && !alphabet->is_valid((uint8_t) fasta[j]))
00692                                 c = (ST) 'A';
00693 
00694                             if (idx>=len)
00695                                 SG_ERROR("idx=%d j=%d fasta_len=%d, spanned_lines=%d str='%.*s'\n", idx, j, fasta_len, spanned_lines, idx, str);
00696                             str[idx++]=c;
00697                         }
00698                         max_len=CMath::max(max_len, strings[i].length);
00699 
00700 
00701                         break;
00702                     }
00703 
00704                     spanned_lines++;
00705                     fasta_len+=len+1; // including '\n'
00706                     s=f.get_line(len, offs);
00707                 }
00708             }
00709 
00710             return set_features(strings, num, max_len);
00711         }
00712 
00720         bool load_fastq_file(const char* fname,
00721                 bool ignore_invalid=false, bool bitremap_in_single_string=false)
00722         {
00723             CMemoryMappedFile<char> f(fname);
00724             
00725             int32_t i=0;
00726             uint64_t len=0;
00727             uint64_t offs=0;
00728 
00729             int32_t num=f.get_num_lines();
00730             int32_t max_len=0;
00731 
00732             if (num%4)
00733                 SG_ERROR("Number of lines must be divisible by 4 in fastq files\n");
00734             num/=4;
00735 
00736             cleanup();
00737             SG_UNREF(alphabet);
00738             alphabet=new CAlphabet(DNA);
00739 
00740             T_STRING<ST>* strings;
00741             
00742             ST* str;
00743             if (bitremap_in_single_string)
00744             {
00745                 strings=new T_STRING<ST>[1];
00746                 strings[0].string=new ST[num];
00747                 strings[0].length=num;
00748                 f.get_line(len, offs);
00749                 f.get_line(len, offs);
00750                 order=len;
00751                 max_len=num;
00752                 offs=0;
00753                 original_num_symbols=alphabet->get_num_symbols();
00754                 int32_t max_val=alphabet->get_num_bits();
00755                 str=new ST[len];
00756             }
00757             else
00758                 strings=new T_STRING<ST>[num];
00759 
00760             for (i=0;i<num; i++)
00761             {
00762                 if (!f.get_line(len, offs))
00763                     SG_ERROR("Error reading 'read' identifier in line %d", 4*i);
00764 
00765                 char* s=f.get_line(len, offs);
00766                 if (!s || len==0)
00767                     SG_ERROR("Error reading 'read' in line %d len=%ld", 4*i+1, len);
00768 
00769                 if (bitremap_in_single_string)
00770                 {
00771                     if (len!=order)
00772                         SG_ERROR("read in line %d not of length %d (is %d)\n", 4*i+1, order, len);
00773                     for (int32_t j=0; j<order; j++)
00774                         str[j]=(ST) alphabet->remap_to_bin((uint8_t) s[j]);
00775 
00776                     strings[0].string[i]=embed_word(str, order);
00777                 }
00778                 else
00779                 {
00780                     strings[i].string=new ST[len];
00781                     strings[i].length=len;
00782                     str=strings[i].string;
00783 
00784                     if (ignore_invalid)
00785                     {
00786                         for (int32_t j=0; j<len; j++)
00787                         {
00788                             if (alphabet->is_valid((uint8_t) s[j]))
00789                                 str[j]= (ST) s[j];
00790                             else
00791                                 str[j]= (ST) 'A';
00792                         }
00793                     }
00794                     else
00795                     {
00796                         for (int32_t j=0; j<len; j++)
00797                             str[j]= (ST) s[j];
00798                     }
00799                     max_len=CMath::max(max_len, (int32_t) len);
00800                 }
00801 
00802 
00803                 if (!f.get_line(len, offs))
00804                     SG_ERROR("Error reading 'read' quality identifier in line %d", 4*i+2);
00805 
00806                 if (!f.get_line(len, offs))
00807                     SG_ERROR("Error reading 'read' quality in line %d", 4*i+3);
00808             }
00809 
00810             if (bitremap_in_single_string)
00811                 num=1;
00812 
00813             num_vectors=num;
00814             max_string_length=max_len;
00815             features=strings;
00816 
00817             return true;
00818         }
00819 
00825         bool load_from_directory(char* dirname)
00826         {
00827             struct dirent **namelist;
00828             int32_t n;
00829 
00830             CIO::set_dirname(dirname);
00831 
00832             SG_DEBUG("dirname '%s'\n", dirname);
00833 
00834             n = scandir(dirname, &namelist, &CIO::filter, alphasort);
00835             if (n <= 0)
00836             {
00837                 SG_ERROR("error calling scandir - no files found\n");
00838                 return false;
00839             }
00840             else
00841             {
00842                 T_STRING<ST>* strings=NULL;
00843 
00844                 int32_t num=0;
00845                 int32_t max_len=-1;
00846 
00847                 //usually n==num_vec, but it might not in race conditions 
00848                 //(file perms modified, file erased)
00849                 strings=new T_STRING<ST>[n];
00850 
00851                 for (int32_t i=0; i<n; i++)
00852                 {
00853                     char* fname=CIO::concat_filename(namelist[i]->d_name);
00854 
00855                     struct stat s;
00856                     off_t filesize=0;
00857 
00858                     if (!stat(fname, &s) && s.st_size>0)
00859                     {
00860                         filesize=s.st_size/sizeof(ST);
00861 
00862                         FILE* f=fopen(fname, "ro");
00863                         if (f)
00864                         {
00865                             ST* str=new ST[filesize];
00866                             SG_DEBUG("%s:%ld\n", fname, (int64_t) filesize);
00867                             fread(str, sizeof(ST), filesize, f);
00868                             strings[num].string=str;
00869                             strings[num].length=filesize;
00870                             max_len=CMath::max(max_len, strings[num].length);
00871 
00872                             num++;
00873                             fclose(f);
00874                         }
00875                     }
00876                     else
00877                         SG_ERROR("empty or non readable file \'%s\'\n", fname);
00878 
00879                     free(namelist[i]);
00880                 }
00881                 free(namelist);
00882 
00883                 if (num>0 && strings)
00884                 {
00885                     set_features(strings, num, max_len);
00886                     return true;
00887                 }
00888             }
00889             return false;
00890         }
00891 
00899         bool set_features(T_STRING<ST>* p_features, int32_t p_num_vectors, int32_t p_max_string_length)
00900         {
00901             if (p_features)
00902             {
00903                 CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet());
00904 
00905                 //compute histogram for char/byte
00906                 for (int32_t i=0; i<p_num_vectors; i++)
00907                     alpha->add_string_to_histogram( p_features[i].string, p_features[i].length);
00908 
00909                 SG_INFO("max_value_in_histogram:%d\n", alpha->get_max_value_in_histogram());
00910                 SG_INFO("num_symbols_in_histogram:%d\n", alpha->get_num_symbols_in_histogram());
00911 
00912                 if (alpha->check_alphabet_size() && alpha->check_alphabet())
00913                 {
00914                     cleanup();
00915                     SG_UNREF(alphabet);
00916 
00917                     alphabet=alpha;
00918                     SG_REF(alphabet);
00919 
00920                     this->features=p_features;
00921                     this->num_vectors=p_num_vectors;
00922                     this->max_string_length=p_max_string_length;
00923 
00924                     return true;
00925                 }
00926                 else
00927                     SG_UNREF(alpha);
00928             }
00929 
00930             return false;
00931         }
00932 
00939         virtual T_STRING<ST>* get_features(int32_t& num_str, int32_t& max_str_len)
00940         {
00941             num_str=num_vectors;
00942             max_str_len=max_string_length;
00943             return features;
00944         }
00945 
00951         virtual void get_features(T_STRING<ST>** dst, int32_t* num_str)
00952         {
00953             *num_str=num_vectors;
00954             *dst=features;
00955         }
00956 
00962         virtual bool save(char* dest)
00963         {
00964             return false;
00965         }
00966 
00971         virtual int32_t get_size() { return sizeof(ST); }
00972 
00978         virtual bool apply_preproc(bool force_preprocessing=false)
00979         {
00980             SG_DEBUG( "force: %d\n", force_preprocessing);
00981 
00982             for (int32_t i=0; i<get_num_preproc(); i++)
00983             { 
00984                 if ( (!is_preprocessed(i) || force_preprocessing) )
00985                 {
00986                     set_preprocessed(i);
00987                     CStringPreProc<ST>* p = (CStringPreProc<ST>*) get_preproc(i);
00988                     SG_INFO( "preprocessing using preproc %s\n", p->get_name());
00989 
00990                     if (!p->apply_to_string_features(this))
00991                     {
00992                         SG_UNREF(p);
00993                         return false;
00994                     }
00995                     else 
00996                         SG_UNREF(p);
00997                 }
00998             }
00999             return true;
01000         }
01001 
01002         /** cut the single stored string into windows, in place (no symbols are copied)
01003          *
01004          * The windows point into the original buffer, which is kept in single_string.
01005          *
01006          * @param window_size length of each window (>0, must fit into the string)
01007          * @param step_size distance between the starts of consecutive windows (>0)
01008          * @param skip number of leading symbols left out of each window
01009          * @return number of windows, which is also the new num_vectors
01010          */
01011         int32_t obtain_by_sliding_window(int32_t window_size, int32_t step_size, int32_t skip=0)
01012         {
01013             ASSERT(step_size>0);
01014             ASSERT(window_size>0);
01015             ASSERT(num_vectors==1 || single_string);
01016             ASSERT(max_string_length>=window_size ||
01017                     (single_string && length_of_single_string>=window_size));
01018
01019             // Window relative to the start of the buffer. When re-windowing, features[0].string
01020             // is the previous first window (offset by the old skip) and must not serve as base.
01021             ST* base=single_string ? single_string : features[0].string;
01022             if (single_string)
01023                 num_vectors= (length_of_single_string-window_size)/step_size + 1;
01024             else if (num_vectors==1)
01025             {
01026                 num_vectors= (max_string_length-window_size)/step_size + 1;
01027                 length_of_single_string=max_string_length;
01028             }
01029
01030             T_STRING<ST>* f=new T_STRING<ST>[num_vectors];
01031             for (int32_t i=0; i<num_vectors; i++)
01032             {
01033                 f[i].string=&base[i*step_size+skip];
01034                 f[i].length=window_size-skip;
01035             }
01036
01037             single_string=base;
01038             delete[] features;
01039             features=f;
01040             max_string_length=window_size-skip;
01041
01042             return num_vectors;
01043         }
01044 
01045         /** cut windows out of the single stored string at explicitly given positions
01046          *
01047          * @param window_size length of each window (>0)
01048          * @param positions start position of each window within the string
01049          * @param skip number of leading symbols left out of each window
01050          * @return number of windows; if a window does not fit, the single-string state is
01051          *         restored, SG_ERROR is raised and (should it return) -1 is returned
01052          */
01053         int32_t obtain_by_position_list(int32_t window_size, CDynamicArray<int32_t>* positions, int32_t skip=0)
01054         {
01055             ASSERT(positions);
01056             ASSERT(window_size>0);
01057             ASSERT(num_vectors==1 || single_string);
01058             ASSERT(max_string_length>=window_size ||
01059                     (single_string && length_of_single_string>=window_size));
01060
01061             num_vectors= positions->get_num_elements();
01062             ASSERT(num_vectors>0);
01063
01064             // First call: remember the whole string as windowing base. Later calls (remapping)
01065             // reuse it, because features[0] then only holds the previous first window.
01066             if (!single_string)
01067             {
01068                 single_string=features[0].string;
01069                 length_of_single_string=max_string_length;
01070             }
01071             const int32_t len=length_of_single_string;
01072
01073             T_STRING<ST>* f=new T_STRING<ST>[num_vectors];
01074             for (int32_t i=0; i<num_vectors; i++)
01075             {
01076                 int32_t p=positions->get_element(i);
01077
01078                 if (p<0 || p>len-window_size)
01079                 {
01080                     // window does not fit: fall back to one vector spanning the whole string
01081                     num_vectors=1;
01082                     max_string_length=len;
01083                     features[0].string=single_string;
01084                     features[0].length=len;
01085                     single_string=NULL;
01086                     delete[] f;
01087                     SG_ERROR("window (size:%d) starting at position[%d]=%d does not fit in sequence(len:%d)\n",
01088                             window_size, i, p, len);
01089                     return -1;
01090                 }
01091
01092                 f[i].string=&single_string[p+skip];
01093                 f[i].length=window_size-skip;
01094             }
01095
01096             delete[] features;
01097             features=f;
01098             max_string_length=window_size-skip;
01099
01100             return num_vectors;
01101         }
01106 
01113         /** entry point for char strings: forwards to obtain_from_char_features()
01114          *
01115          * Arguments and return value are passed through unchanged.
01116          */
01118         inline bool obtain_from_char(CStringFeatures<char>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
01119         {
01120             return this->obtain_from_char_features(sf, start, p_order, gap, rev);
01121         }
01122 
01123         /** rebuild these features from other string features, packing p_order symbols per word
01124          *
01125          * @param sf source features; its alphabet is used to remap symbols to binary codes
01126          * @param start number of leading positions dropped (together with gap) from each string
01127          * @param p_order number of symbols combined into one word
01128          * @param gap number of positions skipped inside each word
01129          * @param rev if true, translate_from_single_order_reversed() is used
01130          * @return false if the resulting symbols do not fit into ST, true otherwise
01131          */
01132         template <class CT>
01133             bool obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
01134             {
01135                 ASSERT(sf);
01136
01137                 CAlphabet* alpha=sf->get_alphabet();
01138                 ASSERT(alpha->get_num_symbols_in_histogram() > 0);
01139
01140                 this->order=p_order;
01141                 cleanup();
01142
01143                 num_vectors=sf->get_num_vectors();
01144                 ASSERT(num_vectors>0);
01145                 max_string_length=sf->get_max_vector_length()-start;
01146                 features=new T_STRING<ST>[num_vectors];
01147                 // floatmax_t is printed with the 'L' modifier ("%1.0llf" is not a valid conversion)
01148                 SG_DEBUG( "%.0Lf symbols in StringFeatures<*> %d symbols in histogram\n",
01149                         (floatmax_t) sf->get_num_symbols(), alpha->get_num_symbols_in_histogram());
01150                 // step 1: copy every source string, remapping each symbol to its binary code
01151                 for (int32_t i=0; i<num_vectors; i++)
01152                 {
01153                     int32_t len=-1;
01154                     CT* c=sf->get_feature_vector(i, len);
01155
01156                     features[i].string=new ST[len];
01157                     features[i].length=len;
01158
01159                     ST* str=features[i].string;
01160                     for (int32_t j=0; j<len; j++)
01161                         str[j]=(ST) alpha->remap_to_bin(c[j]);
01162
01163                 }
01164
01165                 original_num_symbols=alpha->get_num_symbols();
01166                 int32_t max_val=alpha->get_num_bits();
01167
01168                 SG_UNREF(alpha);
01169
01170                 if (p_order>1)
01171                     num_symbols=CMath::powl((floatmax_t) 2, (floatmax_t) max_val*p_order);
01172                 else
01173                     num_symbols=original_num_symbols;
01174                 SG_INFO( "max_val (bit): %d order: %d -> results in num_symbols: %.0Lf\n", max_val, p_order, num_symbols);
01175
01176                 if ( ((floatmax_t) num_symbols) > CMath::powl(((floatmax_t) 2),((floatmax_t) sizeof(ST)*8)) )
01177                 {
01178                     SG_ERROR( "symbol does not fit into datatype \"%c\" (%d)\n", (char) max_val, (int) max_val);
01179                     return false;
01180                 }
01181                 // step 2: pack the words in place; sizeof() yields size_t, so cast it for "%i"
01182                 SG_DEBUG( "translate: start=%i order=%i gap=%i(size:%i)\n", start, p_order, gap, (int32_t) sizeof(ST)) ;
01183                 for (int32_t line=0; line<num_vectors; line++)
01184                 {
01185                     int32_t len=0;
01186                     ST* fv=get_feature_vector(line, len);
01187
01188                     if (rev)
01189                         translate_from_single_order_reversed(fv, len, start+gap, p_order+gap, max_val, gap);
01190                     else
01191                         translate_from_single_order(fv, len, start+gap, p_order+gap, max_val, gap);
01192
01193                     /* fix the length of the string -- hacky */
01194                     features[line].length-=start+gap ;
01195                     if (features[line].length<0)
01196                         features[line].length=0 ;
01197                 }
01198
01199                 compute_symbol_mask_table(max_val);
01200
01201                 return true;
01202             }
01203 
01204         /** check whether all strings have the same length
01205          *
01206          * @param len if not -1, this value must also equal get_max_vector_length()
01207          * @return true if every string is exactly get_max_vector_length() long
01208          *         (and len matches, if given), false otherwise
01209          */
01211         bool have_same_length(int32_t len=-1)
01212         {
01213             // an explicitly requested length has to match the maximum length
01214             if (len!=-1 && len!=get_max_vector_length())
01215                 return false;
01216
01217             const int32_t common=get_max_vector_length();
01218
01219             // advance while the strings are exactly as long as the longest one
01220             int32_t idx=0;
01221             while (idx<num_vectors && get_vector_length(idx)==common)
01222                 idx++;
01223
01224             return idx>=num_vectors;
01225         }
01228 
01229         /** embed the strings in place into words of p_order consecutive symbols
01230          *
01231          * @param p_order number of symbols per word; each string shrinks by p_order-1 entries
01232          */
01233         inline void embed_features(int32_t p_order)
01234         {
01235             ASSERT(alphabet->get_num_symbols_in_histogram() > 0);
01236
01237             order=p_order;
01238             original_num_symbols=alphabet->get_num_symbols();
01239             const int32_t max_val=alphabet->get_num_bits();
01240
01241             if (p_order<=1)
01242                 num_symbols=original_num_symbols;
01243             else
01244                 num_symbols=CMath::powl((floatmax_t) 2, (floatmax_t) max_val*p_order);
01245
01246             SG_INFO( "max_val (bit): %d order: %d -> results in num_symbols: %.0Lf\n", max_val, p_order, num_symbols);
01247
01248             if ( ((floatmax_t) num_symbols) > CMath::powl(((floatmax_t) 2),((floatmax_t) sizeof(ST)*8)) )
01249                 SG_WARNING("symbols did not fit into datatype \"%c\" (%d)\n", (char) max_val, (int) max_val);
01250
01251             // mask with the lowest p_order*max_val bits set, i.e. the bits of one word
01252             ST mask=0;
01253             for (int32_t bit=0; bit<p_order*max_val; bit++)
01254                 mask= (mask<<1) | ((ST) 1);
01255
01256             for (int32_t vec=0; vec<num_vectors; vec++)
01257             {
01258                 const int32_t len=features[vec].length;
01259                 if (len < p_order)
01260                     SG_ERROR("Sequence must be longer than order (%d vs. %d)\n", len, p_order);
01261
01262                 ST* str = features[vec].string;
01263
01264                 // first word: remap the leading p_order symbols, then pack them into str[0]
01265                 for (int32_t pos=0; pos<p_order; pos++)
01266                     str[pos]=(ST) alphabet->remap_to_bin(str[pos]);
01267                 str[0]=embed_word(&str[0], p_order);
01268
01269                 // later words: previous word shifted by one symbol plus the next remapped symbol
01270                 for (int32_t pos=p_order; pos<len; pos++)
01271                 {
01272                     const int32_t word=pos-p_order+1;
01273                     str[pos]=(ST) alphabet->remap_to_bin(str[pos]);
01274                     str[word]= ((str[word-1]<<max_val) | str[pos]) & mask;
01275                 }
01276
01277                 // NOTE(review): max_string_length is not refreshed here -- confirm callers expect that
01278                 features[vec].length=len-p_order+1;
01279             }
01280
01281             compute_symbol_mask_table(max_val);
01282         }
01283 
01284         /** rebuild the 256-entry table that expands a byte mask into a symbol mask
01285          *
01286          * @param max_val number of bits per symbol; bit j of the table index selects symbol slot j
01287          */
01288         inline void compute_symbol_mask_table(int64_t max_val)
01289         {
01290             delete[] symbol_mask_table;
01291             symbol_mask_table=new ST[256];
01292
01293             // pattern of max_val one-bits, i.e. the bits of a single symbol
01294             uint64_t symbol_bits=0;
01295             for (int32_t b=0; b< (int64_t) max_val; b++)
01296                 symbol_bits=(symbol_bits<<1) | 1;
01297
01298             for (int32_t entry=0; entry<256; entry++)
01299             {
01300                 ST expanded=0;
01301                 // every set bit of the index switches on all bits of the matching symbol slot
01302                 for (int32_t slot=0; slot<8; slot++)
01303                 {
01304                     if ((entry>>slot) & 1)
01305                         expanded|=symbol_bits<<(max_val*slot);
01306                 }
01307                 symbol_mask_table[entry]=expanded;
01308             }
01309         }
01311 
01312         /** unpack a word into its symbols and map them back to characters
01313          *
01314          * @param word packed word, alphabet->get_num_bits() bits per symbol
01315          * @param seq output buffer; receives len characters, the lowest symbol goes last
01316          * @param len number of symbols to extract
01317          */
01318         inline void unembed_word(ST word, uint8_t* seq, int32_t len)
01319         {
01320             const uint32_t nbits= (uint32_t) alphabet->get_num_bits();
01321             // mask selecting the lowest symbol of the word
01322             ST mask=0;
01323             for (uint32_t bit=0; bit<nbits; bit++)
01324                 mask=(mask<<1) | (ST) 1;
01325
01326             // peel symbols off the low end and fill seq from back to front
01327             for (int32_t pos=len-1; pos>=0; pos--)
01328             {
01329                 seq[pos]=alphabet->remap_to_char((uint8_t) (ST) (word & mask));
01330                 word>>=nbits;
01331             }
01332         }
01333 
01334         /** pack len symbols into one word, the first symbol ending up in the highest bits
01335          * @param seq symbols to pack
01336          * @param len number of symbols
01337          * @return the packed word (alphabet->get_num_bits() bits per symbol)
01338          */
01339         inline ST embed_word(ST* seq, int32_t len)
01340         {
01341             ST word=(ST) 0;
01342             const uint32_t nbits= (uint32_t) alphabet->get_num_bits();
01343             // make room at the low end, then append the next symbol there
01344             for (int32_t left=len; left>0; left--, seq++)
01345             {
01346                 word<<=nbits;
01347                 word|=*seq;
01348             }
01349             return word;
01350         }
01351 
01352         /** recompute max_string_length as the largest features[i].length
01353          *  (0 if there are no strings) */
01354         void determine_maximum_string_length()
01355         {
01356             int32_t longest=0;
01357             for (int32_t i=0; i<num_vectors; i++)
01358                 longest=CMath::max(longest, features[i].length);
01359             max_string_length=longest;
01360         }
01361 
01363         inline virtual const char* get_name() const { return "StringFeatures"; } // fixed name string of this class
01364 
01365     protected:
01366 
01367         /** combine p_order neighbouring symbols into one word per position, in place
01368          *
01369          * @param obs single-symbol codes, overwritten with the words
01370          * @param sequence_length number of entries in obs
01371          * @param start number of leading positions removed by shifting obs to the left
01372          * @param p_order number of symbols per word
01373          * @param max_val number of bits per symbol
01374          */
01375         void translate_from_single_order(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val)
01376         {
01377             // each symbol enters at the top slot of the word and is shifted down afterwards
01378             const int32_t top_shift=max_val * (p_order-1);
01379
01380             // walk backwards so the symbols to the left are still unconverted when read
01381             for (int32_t pos=sequence_length-1; pos>= p_order-1; pos--)
01382             {
01383                 ST word=0;
01384                 for (int32_t k=pos; k>=pos-p_order+1; k--)
01385                     word= (word >> max_val) | (obs[k] << top_shift);
01386                 obs[pos]= (ST) word;
01387             }
01388
01389             // leading positions: symbols before the start of the sequence contribute nothing
01390             for (int32_t pos=p_order-2; pos>=0; pos--)
01391             {
01392                 if (pos>=sequence_length)
01393                     continue;
01394                 ST word=0;
01395                 for (int32_t k=pos; k>=pos-p_order+1; k--)
01396                 {
01397                     word= (word >> max_val);
01398                     if (k>=0 && k<sequence_length)
01399                         word|=obs[k] << top_shift;
01400                 }
01401                 obs[pos]=word;
01402             }
01403
01404             // TODO we should get rid of this loop!
01405             for (int32_t src=start; src<sequence_length; src++)
01406                 obs[src-start]=obs[src];
01407         }
01408 
01409         /** like translate_from_single_order(), but the symbols are packed in reversed order
01410          *
01411          * @param obs single-symbol codes, overwritten with the words
01412          * @param sequence_length number of entries in obs
01413          * @param start number of leading positions removed by shifting obs to the left
01414          * @param p_order number of symbols per word
01415          * @param max_val number of bits per symbol
01416          */
01417         void translate_from_single_order_reversed(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val)
01418         {
01419             // walk backwards so the symbols to the left are still unconverted when read
01420             for (int32_t pos=sequence_length-1; pos>= p_order-1; pos--)
01421             {
01422                 ST word=0;
01423                 for (int32_t k=pos; k>=pos-p_order+1; k--)
01424                     word= (word << max_val) | obs[k];
01425                 obs[pos]= (ST) word;
01426             }
01427
01428             // leading positions: symbols before the start of the sequence contribute nothing
01429             for (int32_t pos=p_order-2; pos>=0; pos--)
01430             {
01431                 if (pos>=sequence_length)
01432                     continue;
01433
01434                 ST word=0;
01435                 for (int32_t k=pos; k>=pos-p_order+1; k--)
01436                 {
01437                     word= (word << max_val);
01438                     if (k>=0 && k<sequence_length)
01439                         word|=obs[k];
01440                 }
01441                 obs[pos]=word;
01442             }
01443
01444             // TODO we should get rid of this loop!
01445             for (int32_t src=start; src<sequence_length; src++)
01446                 obs[src-start]=obs[src];
01447         }
01450 
01451         /** combine neighbouring symbols into one word per position, leaving out a gap, in place
01452          * @param obs single-symbol codes, overwritten with the words
01453          * @param sequence_length number of entries in obs
01454          * @param start number of leading positions removed by shifting obs to the left
01455          * @param p_order width of the window each word is built from (gap included)
01456          * @param max_val number of bits per symbol
01457          * @param gap number of window positions that are skipped (>=0)
01458          */
01460         void translate_from_single_order(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
01461         {
01462             ASSERT(gap>=0);
01463
01464             // distances (counted back from the current position) in [start_gap, end_gap) are skipped
01465             const int32_t start_gap=(p_order-gap)/2;
01466             const int32_t end_gap=start_gap+gap;
01467             const int32_t top_shift=max_val * (p_order-1-gap);
01468
01469             // almost all positions
01470             for (int32_t pos=sequence_length-1; pos>=p_order-1; pos--)
01471             {
01472                 ST word=0;
01473                 for (int32_t k=pos; k>=pos-p_order+1; k--)
01474                 {
01475                     const int32_t back=pos-k;
01476                     if (back<start_gap || back>=end_gap)
01477                         word= (word >> max_val) | (obs[k] << top_shift);
01478                 }
01479                 obs[pos]= (ST) word;
01480             }
01481
01482             // the remaining `order` positions
01483             for (int32_t pos=p_order-2; pos>=0; pos--)
01484             {
01485                 if (pos>=sequence_length)
01486                     continue;
01487
01488                 ST word=0;
01489                 for (int32_t k=pos; k>=pos-p_order+1; k--)
01490                 {
01491                     const int32_t back=pos-k;
01492                     if (back>=start_gap && back<end_gap)
01493                         continue;
01494
01495                     word= (word >> max_val);
01496                     if (k>=0 && k<sequence_length)
01497                         word|=obs[k] << top_shift;
01498                 }
01499                 obs[pos]=word;
01500             }
01501
01502             // TODO we should get rid of this loop!
01503             for (int32_t src=start; src<sequence_length; src++)
01504                 obs[src-start]=obs[src];
01505         }
01517 
01518         /** gapped variant of translate_from_single_order_reversed(), in place
01519          * @param obs single-symbol codes, overwritten with the words
01520          * @param sequence_length number of entries in obs
01521          * @param start number of leading positions removed by shifting obs to the left
01522          * @param p_order width of the window each word is built from (gap included)
01523          * @param max_val number of bits per symbol
01524          * @param gap number of window positions that are skipped (>=0)
01525          */
01527         void translate_from_single_order_reversed(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
01528         {
01529             ASSERT(gap>=0);
01530
01531             // distances (counted back from the current position) in [start_gap, end_gap) are skipped
01532             const int32_t start_gap=(p_order-gap)/2;
01533             const int32_t end_gap=start_gap+gap;
01534
01535             // almost all positions
01536             for (int32_t pos=sequence_length-1; pos>=p_order-1; pos--)
01537             {
01538                 ST word=0;
01539                 for (int32_t k=pos; k>=pos-p_order+1; k--)
01540                 {
01541                     const int32_t back=pos-k;
01542                     if (back<start_gap || back>=end_gap)
01543                         word= (word << max_val) | obs[k];
01544                 }
01545                 obs[pos]= (ST) word;
01546             }
01547
01548             // the remaining `order` positions
01549             for (int32_t pos=p_order-2; pos>=0; pos--)
01550             {
01551                 if (pos>=sequence_length)
01552                     continue;
01553
01554                 ST word=0;
01555                 for (int32_t k=pos; k>=pos-p_order+1; k--)
01556                 {
01557                     const int32_t back=pos-k;
01558                     if (back>=start_gap && back<end_gap)
01559                         continue;
01560
01561                     word= word << max_val;
01562                     if (k>=0 && k<sequence_length)
01563                         word|=obs[k];
01564                 }
01565                 obs[pos]=word;
01566             }
01567
01568             // TODO we should get rid of this loop!
01569             for (int32_t src=start; src<sequence_length; src++)
01570                 obs[src-start]=obs[src];
01571         }
01580 
01581     protected:
01582 
01583         /** store a string at index num; the previous pointer is overwritten, not freed
01584          *
01585          * @param num index of the string (asserted to be < num_vectors)
01586          * @param string pointer stored as-is, no copy is made
01587          * @param len length recorded for that string
01588          */
01589         virtual void set_feature_vector(int32_t num, ST* string, int32_t len)
01590         {
01591             ASSERT(features);
01592             ASSERT(num<num_vectors);
01593
01594             T_STRING<ST>& slot=features[num];
01595             slot.string=string;
01596             slot.length=len;
01597         }
01597 
01598 
01599     protected:
01600 
01602         CAlphabet* alphabet; ///< alphabet used to remap symbols (remap_to_bin / remap_to_char) and to query bits per symbol
01603 
01605         int32_t num_vectors; ///< number of strings held in features
01606 
01608         T_STRING<ST>* features; ///< array of num_vectors strings (pointer + length each)
01609 
01611         ST* single_string; ///< set by the obtain_by_* windowing methods: the one string the windows in features point into; NULL otherwise
01612 
01614         int32_t length_of_single_string; ///< length of the string the windows were cut from
01615 
01617         int32_t max_string_length; ///< length of the longest string, see determine_maximum_string_length()
01618 
01620         floatmax_t num_symbols; ///< number of symbols; 2^(bits*order) once an order > 1 has been embedded
01621 
01623         floatmax_t original_num_symbols; ///< number of symbols of the alphabet itself (order 1)
01624 
01626         int32_t order; ///< order last set by embed_features() / obtain_from_char_features()
01627 
01629         ST* symbol_mask_table; ///< 256-entry table built by compute_symbol_mask_table()
01630 };
01631 
01632 #ifndef DOXYGEN_SHOULD_SKIP_THIS
01633 
/* Explicit specializations of get_feature_type(): each element type ST reports its own feature type tag. */
/* bool strings report F_BOOL */
01637 template<> inline EFeatureType CStringFeatures<bool>::get_feature_type()
01638 {
01639     return F_BOOL;
01640 }
01641 
/* char strings report F_CHAR */
01646 template<> inline EFeatureType CStringFeatures<char>::get_feature_type()
01647 {
01648     return F_CHAR;
01649 }
01650 
/* uint8_t strings report F_BYTE */
01655 template<> inline EFeatureType CStringFeatures<uint8_t>::get_feature_type()
01656 {
01657     return F_BYTE;
01658 }
01659 
/* int16_t strings report F_SHORT */
01664 template<> inline EFeatureType CStringFeatures<int16_t>::get_feature_type()
01665 {
01666     return F_SHORT;
01667 }
01668 
/* uint16_t strings report F_WORD */
01673 template<> inline EFeatureType CStringFeatures<uint16_t>::get_feature_type()
01674 {
01675     return F_WORD;
01676 }
01677 
/* int32_t strings report F_INT */
01682 template<> inline EFeatureType CStringFeatures<int32_t>::get_feature_type()
01683 {
01684     return F_INT;
01685 }
01686 
/* uint32_t strings report F_UINT */
01691 template<> inline EFeatureType CStringFeatures<uint32_t>::get_feature_type()
01692 {
01693     return F_UINT;
01694 }
01695 
/* int64_t strings report F_LONG */
01700 template<> inline EFeatureType CStringFeatures<int64_t>::get_feature_type()
01701 {
01702     return F_LONG;
01703 }
01704 
/* uint64_t strings report F_ULONG */
01709 template<> inline EFeatureType CStringFeatures<uint64_t>::get_feature_type()
01710 {
01711     return F_ULONG;
01712 }
01713 
/* float32_t strings report F_SHORTREAL */
01718 template<> inline EFeatureType CStringFeatures<float32_t>::get_feature_type()
01719 {
01720     return F_SHORTREAL;
01721 }
01722 
/* float64_t strings report F_DREAL */
01727 template<> inline EFeatureType CStringFeatures<float64_t>::get_feature_type()
01728 {
01729     return F_DREAL;
01730 }
01731 
/* floatmax_t strings report F_LONGREAL */
01736 template<> inline EFeatureType CStringFeatures<floatmax_t>::get_feature_type()
01737 {
01738     return F_LONGREAL;
01739 }
01740 
/* get_masked_symbols() for bool and floating point element types: the mask is ignored and the symbol is returned unchanged. */
01741 template<> inline bool CStringFeatures<bool>::get_masked_symbols(bool symbol, uint8_t mask)
01742 {
01743     return symbol;
01744 }
01745 template<> inline float32_t CStringFeatures<float32_t>::get_masked_symbols(float32_t symbol, uint8_t mask)
01746 {
01747     return symbol;
01748 }
01749 template<> inline float64_t CStringFeatures<float64_t>::get_masked_symbols(float64_t symbol, uint8_t mask)
01750 {
01751     return symbol;
01752 }
01753 template<> inline floatmax_t CStringFeatures<floatmax_t>::get_masked_symbols(floatmax_t symbol, uint8_t mask)
01754 {
01755     return symbol;
01756 }
01757 
/* shift_offset() for bool and floating point element types: both arguments are ignored; the result is always false / 0. */
01758 template<> inline bool CStringFeatures<bool>::shift_offset(bool symbol, int32_t amount)
01759 {
01760     return false;
01761 }
01762 template<> inline float32_t CStringFeatures<float32_t>::shift_offset(float32_t symbol, int32_t amount)
01763 {
01764     return 0;
01765 }
01766 template<> inline float64_t CStringFeatures<float64_t>::shift_offset(float64_t symbol, int32_t amount)
01767 {
01768     return 0;
01769 }
01770 template<> inline floatmax_t CStringFeatures<floatmax_t>::shift_offset(floatmax_t symbol, int32_t amount)
01771 {
01772     return 0;
01773 }
01774 
/* shift_symbol() for bool and floating point element types: the amount is ignored and the symbol is returned unchanged. */
01775 template<> inline bool CStringFeatures<bool>::shift_symbol(bool symbol, int32_t amount)
01776 {
01777     return symbol;
01778 }
01779 template<> inline float32_t CStringFeatures<float32_t>::shift_symbol(float32_t symbol, int32_t amount)
01780 {
01781     return symbol;
01782 }
01783 template<> inline float64_t CStringFeatures<float64_t>::shift_symbol(float64_t symbol, int32_t amount)
01784 {
01785     return symbol;
01786 }
01787 template<> inline floatmax_t CStringFeatures<floatmax_t>::shift_symbol(floatmax_t symbol, int32_t amount)
01788 {
01789     return symbol;
01790 }
01791 
/* Gapped translate_from_single_order() for floating point element types: deliberately empty, obs is left untouched
   (the generic version relies on bit shifts). */
01792 template<> inline void CStringFeatures<float32_t>::translate_from_single_order(float32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
01793 {
01794 }
01795 
01796 template<> inline void CStringFeatures<float64_t>::translate_from_single_order(float64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
01797 {
01798 }
01799 
01800 template<> inline void CStringFeatures<floatmax_t>::translate_from_single_order(floatmax_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
01801 {
01802 }
01803 
/* Gapped translate_from_single_order_reversed() for floating point element types: deliberately empty, obs is left untouched. */
01804 template<> inline void CStringFeatures<float32_t>::translate_from_single_order_reversed(float32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
01805 {
01806 }
01807 
01808 template<> inline void CStringFeatures<float64_t>::translate_from_single_order_reversed(float64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
01809 {
01810 }
01811 
01812 template<> inline void CStringFeatures<floatmax_t>::translate_from_single_order_reversed(floatmax_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
01813 {
01814 }
01815 
/* obtain_from_char_features() for floating point element types: nothing is converted, the call always reports failure (false). */
01816 template<>  template <class CT> bool CStringFeatures<float32_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
01817 {
01818     return false;
01819 }
01820 template<>  template <class CT> bool CStringFeatures<float64_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
01821 {
01822     return false;
01823 }
01824 template<>  template <class CT> bool CStringFeatures<floatmax_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
01825 {
01826     return false;
01827 }
01828 
/* embed_features() for floating point element types: deliberately empty, the features are left untouched. */
01829 template<>  inline void CStringFeatures<float32_t>::embed_features(int32_t p_order)
01830 {
01831 }
01832 template<>  inline void CStringFeatures<float64_t>::embed_features(int32_t p_order)
01833 {
01834 }
01835 template<>  inline void CStringFeatures<floatmax_t>::embed_features(int32_t p_order)
01836 {
01837 }
01838 
/* compute_symbol_mask_table() for floating point element types: deliberately empty, no table is allocated or changed. */
01839 template<>  inline void CStringFeatures<float32_t>::compute_symbol_mask_table(int64_t max_val)
01840 {
01841 }
01842 template<>  inline void CStringFeatures<float64_t>::compute_symbol_mask_table(int64_t max_val)
01843 {
01844 }
01845 template<>  inline void CStringFeatures<floatmax_t>::compute_symbol_mask_table(int64_t max_val)
01846 {
01847 }
01848 
/* embed_word() for floating point element types: the input is ignored and 0 is returned. */
01849 template<>  inline float32_t CStringFeatures<float32_t>::embed_word(float32_t* seq, int32_t len)
01850 {
01851     return 0;
01852 }
01853 template<>  inline float64_t CStringFeatures<float64_t>::embed_word(float64_t* seq, int32_t len)
01854 {
01855     return 0;
01856 }
01857 template<>  inline floatmax_t CStringFeatures<floatmax_t>::embed_word(floatmax_t* seq, int32_t len)
01858 {
01859     return 0;
01860 }
01861 
/* unembed_word() for floating point element types: deliberately empty, seq is not written. */
01862 template<>  inline void CStringFeatures<float32_t>::unembed_word(float32_t word, uint8_t* seq, int32_t len)
01863 {
01864 }
01865 template<>  inline void CStringFeatures<float64_t>::unembed_word(float64_t word, uint8_t* seq, int32_t len)
01866 {
01867 }
01868 template<>  inline void CStringFeatures<floatmax_t>::unembed_word(floatmax_t word, uint8_t* seq, int32_t len)
01869 {
01870 }
01871 #endif // DOXYGEN_SHOULD_SKIP_THIS
01872 #endif // _CSTRINGFEATURES__H__

SHOGUN Machine Learning Toolbox - Documentation