00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #ifndef _CSTRINGFEATURES__H__
00013 #define _CSTRINGFEATURES__H__
00014
00015 #include "preproc/PreProc.h"
00016 #include "preproc/StringPreProc.h"
00017 #include "features/Features.h"
00018 #include "features/SimpleFeatures.h"
00019 #include "features/Alphabet.h"
00020 #include "lib/common.h"
00021 #include "lib/io.h"
00022 #include "lib/DynamicArray.h"
00023 #include "lib/File.h"
00024 #include "lib/MemoryMappedFile.h"
00025 #include "lib/Mathematics.h"
00026
00027 #include <sys/types.h>
00028 #include <sys/stat.h>
00029 #include <dirent.h>
00030 #include <stdio.h>
00031 #include <stdlib.h>
00032 #include <unistd.h>
00033
00034 class CFile;
00035
00036 template <class ST> class CStringPreProc;
00037
#ifndef DOXYGEN_SHOULD_SKIP_THIS
/** a string of elements of type T together with its length
 * (the data is not zero terminated)
 */
template <class T> struct T_STRING
{
	/** pointer to the first of `length` elements */
	T* string;
	/** number of elements stored in `string` */
	int32_t length;
};
#endif // DOXYGEN_SHOULD_SKIP_THIS

/** create a zero terminated char copy of a T_STRING
 *
 * Each element is converted to char individually, so the result is
 * also meaningful for element types wider than one byte.
 * A NULL string or a non-positive length yields an empty string.
 *
 * @param str string to copy
 * @return newly allocated (new[]) zero terminated copy; the caller
 *         releases it with delete[]
 */
template <class T> char* get_zero_terminated_string_copy(T_STRING<T> str)
{
	int32_t l=(str.string && str.length>0) ? str.length : 0;
	char* s=new char[l+1];

	// element-wise conversion: copying l raw bytes would only be
	// correct when sizeof(T)==1
	for (int32_t i=0; i<l; i++)
		s[i]=(char) str.string[i];

	s[l]='\0';
	return s;
}
00057
00071 template <class ST> class CStringFeatures : public CFeatures
00072 {
00073 public:
00078 CStringFeatures(EAlphabet alpha)
00079 : CFeatures(0), num_vectors(0), features(NULL),
00080 single_string(NULL),length_of_single_string(0),
00081 max_string_length(0), order(0), symbol_mask_table(NULL)
00082 {
00083 alphabet=new CAlphabet(alpha);
00084 SG_REF(alphabet);
00085 num_symbols=alphabet->get_num_symbols();
00086 original_num_symbols=num_symbols;
00087 }
00088
00096 CStringFeatures(T_STRING<ST>* p_features, int32_t p_num_vectors,
00097 int32_t p_max_string_length, EAlphabet alpha)
00098 : CFeatures(0), num_vectors(0), features(NULL),
00099 single_string(NULL),length_of_single_string(0),
00100 max_string_length(0), order(0), symbol_mask_table(NULL)
00101 {
00102 alphabet=new CAlphabet(alpha);
00103 SG_REF(alphabet);
00104 num_symbols=alphabet->get_num_symbols();
00105 original_num_symbols=num_symbols;
00106 set_features(p_features, p_num_vectors, p_max_string_length);
00107 }
00108
00113 CStringFeatures(CAlphabet* alpha)
00114 : CFeatures(0), num_vectors(0), features(NULL),
00115 single_string(NULL),length_of_single_string(0),
00116 max_string_length(0), order(0), symbol_mask_table(NULL)
00117 {
00118 ASSERT(alpha);
00119 SG_REF(alpha);
00120 alphabet=alpha;
00121 num_symbols=alphabet->get_num_symbols();
00122 original_num_symbols=num_symbols;
00123 }
00124
00126 CStringFeatures(const CStringFeatures & orig)
00127 : CFeatures(orig), num_vectors(orig.num_vectors),
00128 single_string(orig.single_string),
00129 length_of_single_string(orig.length_of_single_string),
00130 max_string_length(orig.max_string_length),
00131 num_symbols(orig.num_symbols),
00132 original_num_symbols(orig.original_num_symbols),
00133 order(orig.order)
00134 {
00135 ASSERT(orig.single_string == NULL);
00136
00137 alphabet=orig.alphabet;
00138 SG_REF(alphabet);
00139
00140 if (orig.features)
00141 {
00142 features=new T_STRING<ST>[orig.num_vectors];
00143
00144 for (int32_t i=0; i<num_vectors; i++)
00145 {
00146 features[i].string=new ST[orig.features[i].length];
00147 ASSERT(features[i].string);
00148 features[i].length=orig.features[i].length;
00149 memcpy(features[i].string, orig.features[i].string, sizeof(ST)*orig.features[i].length);
00150 }
00151 }
00152
00153 if (orig.symbol_mask_table)
00154 {
00155 symbol_mask_table=new ST[256];
00156 for (int32_t i=0; i<256; i++)
00157 symbol_mask_table[i]=orig.symbol_mask_table[i];
00158 }
00159 }
00160
00166 CStringFeatures(char* fname, EAlphabet alpha=DNA)
00167 : CFeatures(fname), num_vectors(0),
00168 features(NULL), single_string(NULL),
00169 length_of_single_string(0), max_string_length(0),
00170 order(0), symbol_mask_table(NULL)
00171 {
00172 alphabet=new CAlphabet(alpha);
00173 SG_REF(alphabet);
00174 num_symbols=alphabet->get_num_symbols();
00175 original_num_symbols=num_symbols;
00176 load(fname);
00177 }
00178
00179 virtual ~CStringFeatures()
00180 {
00181 cleanup();
00182
00183 SG_UNREF(alphabet);
00184 }
00185
00187 void cleanup()
00188 {
00189 if (single_string)
00190 {
00191 delete[] single_string;
00192 single_string=NULL;
00193 }
00194 else
00195 {
00196 for (int32_t i=0; i<num_vectors; i++)
00197 {
00198 delete[] features[i].string;
00199 features[i].length=0;
00200 }
00201 }
00202
00203 num_vectors=0;
00204 delete[] features;
00205 delete[] symbol_mask_table;
00206 features=NULL;
00207 symbol_mask_table=NULL;
00208
00209
00210
00211
00212
00213 CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet());
00214 SG_UNREF(alphabet);
00215 alphabet=alpha;
00216 SG_REF(alphabet);
00217 }
00218
00223 inline virtual EFeatureClass get_feature_class() { return C_STRING; }
00224
00229 inline virtual EFeatureType get_feature_type() { return F_UNKNOWN; }
00230
00235 inline CAlphabet* get_alphabet()
00236 {
00237 SG_REF(alphabet);
00238 return alphabet;
00239 }
00240
00245 virtual CFeatures* duplicate() const
00246 {
00247 return new CStringFeatures<ST>(*this);
00248 }
00249
00256 void get_feature_vector(ST** dst, int32_t* len, int32_t num)
00257 {
00258 ASSERT(features);
00259 if (num>=num_vectors)
00260 {
00261 SG_ERROR("Index out of bounds (number of strings %d, you "
00262 "requested %d)\n", num_vectors, num);
00263 }
00264
00265 *len=features[num].length;
00266 *dst=(ST*) malloc(*len * sizeof(ST));
00267 memcpy(*dst, features[num].string, *len * sizeof(ST));
00268 }
00269
00276 void set_feature_vector(ST* src, int32_t len, int32_t num)
00277 {
00278 ASSERT(features);
00279 if (num>=num_vectors)
00280 {
00281 SG_ERROR("Index out of bounds (number of strings %d, you "
00282 "requested %d)\n", num_vectors, num);
00283 }
00284
00285 if (len<=0)
00286 SG_ERROR("String has zero or negative length\n");
00287
00288
00289 delete[] features[num].string;
00290 features[num].length=len;
00291 features[num].string=new ST[len];
00292 memcpy(features[num].string, src, len*sizeof(ST));
00293
00294 determine_maximum_string_length();
00295 }
00296
00303 virtual ST* get_feature_vector(int32_t num, int32_t& len)
00304 {
00305 ASSERT(features);
00306 ASSERT(num<num_vectors);
00307
00308 len=features[num].length;
00309 return features[num].string;
00310 }
00311
00318 virtual ST inline get_feature(int32_t vec_num, int32_t feat_num)
00319 {
00320 ASSERT(features && vec_num<num_vectors);
00321 ASSERT(feat_num<features[vec_num].length);
00322
00323 return features[vec_num].string[feat_num];
00324 }
00325
00331 virtual inline int32_t get_vector_length(int32_t vec_num)
00332 {
00333 ASSERT(features && vec_num<num_vectors);
00334 return features[vec_num].length;
00335 }
00336
00341 virtual inline int32_t get_max_vector_length()
00342 {
00343 return max_string_length;
00344 }
00345
00350 virtual inline int32_t get_num_vectors() { return num_vectors; }
00351
00358 inline floatmax_t get_num_symbols() { return num_symbols; }
00359
00367 inline floatmax_t get_max_num_symbols() { return CMath::powl(2,sizeof(ST)*8); }
00368
00369
00370
00375 inline floatmax_t get_original_num_symbols() { return original_num_symbols; }
00376
00381 inline int32_t get_order() { return order; }
00382
00390 inline ST get_masked_symbols(ST symbol, uint8_t mask)
00391 {
00392 ASSERT(symbol_mask_table);
00393 return symbol_mask_table[mask] & symbol;
00394 }
00395
00402 inline ST shift_offset(ST offset, int32_t amount)
00403 {
00404 ASSERT(alphabet);
00405 return (offset << (amount*alphabet->get_num_bits()));
00406 }
00407
00414 inline ST shift_symbol(ST symbol, int32_t amount)
00415 {
00416 ASSERT(alphabet);
00417 return (symbol >> (amount*alphabet->get_num_bits()));
00418 }
00419
00425 virtual bool load(char* fname)
00426 {
00427 SG_INFO( "loading...\n");
00428 int64_t length=0;
00429 max_string_length=0;
00430
00431 CFile f(fname, 'r', F_CHAR);
00432 char* feature_matrix=f.load_char_data(NULL, length);
00433
00434 num_vectors=0;
00435
00436 if (f.is_ok())
00437 {
00438 for (int64_t i=0; i<length; i++)
00439 {
00440 if (feature_matrix[i]=='\n')
00441 num_vectors++;
00442 }
00443
00444 SG_INFO( "file contains %ld vectors\n", num_vectors);
00445 features= new T_STRING<ST>[num_vectors];
00446
00447 int64_t index=0;
00448 for (int32_t lines=0; lines<num_vectors; lines++)
00449 {
00450 char* p=&feature_matrix[index];
00451 int32_t columns=0;
00452
00453 for (columns=0; index+columns<length && p[columns]!='\n'; columns++);
00454
00455 if (index+columns>=length && p[columns]!='\n') {
00456 SG_ERROR( "error in \"%s\":%d\n", fname, lines);
00457 }
00458
00459 features[lines].length=columns;
00460 features[lines].string=new ST[columns];
00461
00462 max_string_length=CMath::max(max_string_length,columns);
00463
00464 for (int32_t i=0; i<columns; i++)
00465 features[lines].string[i]= ((ST) p[i]);
00466
00467 index+= features[lines].length+1;
00468 }
00469
00470 num_symbols=4;
00471 return true;
00472 }
00473 else
00474 SG_ERROR( "reading file failed\n");
00475
00476 return false;
00477 }
00478
00485 bool load_dna_file(char* fname, bool remap_to_bin=true)
00486 {
00487 bool result=false;
00488
00489 size_t blocksize=1024*1024;
00490 size_t required_blocksize=0;
00491 uint8_t* dummy=new uint8_t[blocksize];
00492 uint8_t* overflow=NULL;
00493 int32_t overflow_len=0;
00494
00495 num_symbols=4;
00496 cleanup();
00497
00498 CAlphabet* alpha=new CAlphabet(DNA);
00499 CAlphabet* alpha_bin=new CAlphabet(RAWDNA);
00500
00501 FILE* f=fopen(fname, "ro");
00502
00503 if (f)
00504 {
00505 num_vectors=0;
00506 max_string_length=0;
00507
00508 SG_INFO("counting line numbers in file %s\n", fname);
00509 size_t block_offs=0;
00510 size_t old_block_offs=0;
00511 fseek(f, 0, SEEK_END);
00512 size_t fsize=ftell(f);
00513 rewind(f);
00514
00515 if (blocksize>fsize)
00516 blocksize=fsize;
00517
00518 SG_DEBUG("block_size=%ld file_size=%ld\n", blocksize, fsize);
00519
00520 size_t sz=blocksize;
00521 while (sz == blocksize)
00522 {
00523 sz=fread(dummy, sizeof(uint8_t), blocksize, f);
00524 bool contains_cr=false;
00525 for (size_t i=0; i<sz; i++)
00526 {
00527 block_offs++;
00528 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00529 {
00530 num_vectors++;
00531 contains_cr=true;
00532 required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs);
00533 old_block_offs=block_offs;
00534 }
00535 }
00536 SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");
00537 }
00538
00539 SG_INFO("found %d strings\n", num_vectors);
00540 delete[] dummy;
00541 blocksize=required_blocksize;
00542 dummy = new uint8_t[blocksize];
00543 overflow = new uint8_t[blocksize];
00544 features=new T_STRING<ST>[num_vectors];
00545
00546 rewind(f);
00547 sz=blocksize;
00548 int32_t lines=0;
00549 while (sz == blocksize)
00550 {
00551 sz=fread(dummy, sizeof(uint8_t), blocksize, f);
00552
00553 size_t old_sz=0;
00554 for (size_t i=0; i<sz; i++)
00555 {
00556 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00557 {
00558 int32_t len=i-old_sz;
00559
00560 max_string_length=CMath::max(max_string_length, len+overflow_len);
00561
00562 features[lines].length=len;
00563 features[lines].string=new ST[len];
00564
00565 if (remap_to_bin)
00566 {
00567 for (int32_t j=0; j<overflow_len; j++)
00568 features[lines].string[j]=alpha->remap_to_bin(overflow[j]);
00569 for (int32_t j=0; j<len; j++)
00570 features[lines].string[j+overflow_len]=alpha->remap_to_bin(dummy[old_sz+j]);
00571 alpha_bin->add_string_to_histogram(features[lines].string, features[lines].length);
00572 }
00573 else
00574 {
00575 for (int32_t j=0; j<overflow_len; j++)
00576 features[lines].string[j]=overflow[j];
00577 for (int32_t j=0; j<len; j++)
00578 features[lines].string[j+overflow_len]=dummy[old_sz+j];
00579 alpha->add_string_to_histogram(features[lines].string, features[lines].length);
00580 }
00581
00582
00583 overflow_len=0;
00584
00585
00586 old_sz=i+1;
00587 lines++;
00588 SG_PROGRESS(lines, 0, num_vectors, 1, "LOADING:\t");
00589 }
00590 }
00591 for (size_t i=old_sz; i<sz; i++)
00592 overflow[i-old_sz]=dummy[i];
00593
00594 overflow_len=sz-old_sz;
00595 }
00596 result=true;
00597 SG_INFO("file successfully read\n");
00598 SG_INFO("max_string_length=%d\n", max_string_length);
00599 SG_INFO("num_strings=%d\n", num_vectors);
00600 }
00601
00602 fclose(f);
00603 delete[] dummy;
00604
00605 SG_UNREF(alphabet);
00606
00607 if (remap_to_bin)
00608 alphabet = alpha_bin;
00609 else
00610 alphabet = alpha;
00611 SG_REF(alphabet);
00612
00613 return result;
00614 }
00615
00622 bool load_fasta_file(const char* fname, bool ignore_invalid=false)
00623 {
00624 int32_t i=0;
00625 uint64_t len=0;
00626 uint64_t offs=0;
00627 int32_t num=0;
00628 int32_t max_len=0;
00629
00630 CMemoryMappedFile<char> f(fname);
00631
00632 while (true)
00633 {
00634 char* s=f.get_line(len, offs);
00635 if (!s)
00636 break;
00637
00638 if (len>0 && s[0]=='>')
00639 num++;
00640 }
00641
00642 if (num==0)
00643 SG_ERROR("No fasta hunks (lines starting with '>') found\n");
00644
00645 cleanup();
00646 SG_UNREF(alphabet);
00647 alphabet=new CAlphabet(DNA);
00648
00649 T_STRING<ST>* strings=new T_STRING<ST>[num];
00650 offs=0;
00651
00652 for (i=0;i<num; i++)
00653 {
00654 uint64_t id_len=0;
00655 char* id=f.get_line(id_len, offs);
00656
00657 char* fasta=f.get_line(len, offs);
00658 char* s=fasta;
00659 int32_t fasta_len=0;
00660 int32_t spanned_lines=0;
00661
00662 while (true)
00663 {
00664 if (!s || len==0)
00665 SG_ERROR("Error reading fasta entry in line %d len=%ld", 4*i+1, len);
00666
00667 if (s[0]=='>' || offs==f.get_size())
00668 {
00669 offs-=len+1;
00670 if (offs==f.get_size())
00671 {
00672 SG_DEBUG("at EOF\n");
00673 fasta_len+=len;
00674 }
00675
00676 len = fasta_len-spanned_lines;
00677 strings[i].string=new ST[len];
00678 strings[i].length=len;
00679
00680 ST* str=strings[i].string;
00681 int32_t idx=0;
00682 SG_DEBUG("'%.*s', len=%d, spanned_lines=%d\n", (int32_t) id_len, id, (int32_t) len, (int32_t) spanned_lines);
00683
00684 for (int32_t j=0; j<fasta_len; j++)
00685 {
00686 if (fasta[j]=='\n')
00687 continue;
00688
00689 ST c = (ST) fasta[j];
00690
00691 if (ignore_invalid && !alphabet->is_valid((uint8_t) fasta[j]))
00692 c = (ST) 'A';
00693
00694 if (idx>=len)
00695 SG_ERROR("idx=%d j=%d fasta_len=%d, spanned_lines=%d str='%.*s'\n", idx, j, fasta_len, spanned_lines, idx, str);
00696 str[idx++]=c;
00697 }
00698 max_len=CMath::max(max_len, strings[i].length);
00699
00700
00701 break;
00702 }
00703
00704 spanned_lines++;
00705 fasta_len+=len+1;
00706 s=f.get_line(len, offs);
00707 }
00708 }
00709
00710 return set_features(strings, num, max_len);
00711 }
00712
00720 bool load_fastq_file(const char* fname,
00721 bool ignore_invalid=false, bool bitremap_in_single_string=false)
00722 {
00723 CMemoryMappedFile<char> f(fname);
00724
00725 int32_t i=0;
00726 uint64_t len=0;
00727 uint64_t offs=0;
00728
00729 int32_t num=f.get_num_lines();
00730 int32_t max_len=0;
00731
00732 if (num%4)
00733 SG_ERROR("Number of lines must be divisible by 4 in fastq files\n");
00734 num/=4;
00735
00736 cleanup();
00737 SG_UNREF(alphabet);
00738 alphabet=new CAlphabet(DNA);
00739
00740 T_STRING<ST>* strings;
00741
00742 ST* str;
00743 if (bitremap_in_single_string)
00744 {
00745 strings=new T_STRING<ST>[1];
00746 strings[0].string=new ST[num];
00747 strings[0].length=num;
00748 f.get_line(len, offs);
00749 f.get_line(len, offs);
00750 order=len;
00751 max_len=num;
00752 offs=0;
00753 original_num_symbols=alphabet->get_num_symbols();
00754 int32_t max_val=alphabet->get_num_bits();
00755 str=new ST[len];
00756 }
00757 else
00758 strings=new T_STRING<ST>[num];
00759
00760 for (i=0;i<num; i++)
00761 {
00762 if (!f.get_line(len, offs))
00763 SG_ERROR("Error reading 'read' identifier in line %d", 4*i);
00764
00765 char* s=f.get_line(len, offs);
00766 if (!s || len==0)
00767 SG_ERROR("Error reading 'read' in line %d len=%ld", 4*i+1, len);
00768
00769 if (bitremap_in_single_string)
00770 {
00771 if (len!=order)
00772 SG_ERROR("read in line %d not of length %d (is %d)\n", 4*i+1, order, len);
00773 for (int32_t j=0; j<order; j++)
00774 str[j]=(ST) alphabet->remap_to_bin((uint8_t) s[j]);
00775
00776 strings[0].string[i]=embed_word(str, order);
00777 }
00778 else
00779 {
00780 strings[i].string=new ST[len];
00781 strings[i].length=len;
00782 str=strings[i].string;
00783
00784 if (ignore_invalid)
00785 {
00786 for (int32_t j=0; j<len; j++)
00787 {
00788 if (alphabet->is_valid((uint8_t) s[j]))
00789 str[j]= (ST) s[j];
00790 else
00791 str[j]= (ST) 'A';
00792 }
00793 }
00794 else
00795 {
00796 for (int32_t j=0; j<len; j++)
00797 str[j]= (ST) s[j];
00798 }
00799 max_len=CMath::max(max_len, (int32_t) len);
00800 }
00801
00802
00803 if (!f.get_line(len, offs))
00804 SG_ERROR("Error reading 'read' quality identifier in line %d", 4*i+2);
00805
00806 if (!f.get_line(len, offs))
00807 SG_ERROR("Error reading 'read' quality in line %d", 4*i+3);
00808 }
00809
00810 if (bitremap_in_single_string)
00811 num=1;
00812
00813 num_vectors=num;
00814 max_string_length=max_len;
00815 features=strings;
00816
00817 return true;
00818 }
00819
00825 bool load_from_directory(char* dirname)
00826 {
00827 struct dirent **namelist;
00828 int32_t n;
00829
00830 CIO::set_dirname(dirname);
00831
00832 SG_DEBUG("dirname '%s'\n", dirname);
00833
00834 n = scandir(dirname, &namelist, &CIO::filter, alphasort);
00835 if (n <= 0)
00836 {
00837 SG_ERROR("error calling scandir - no files found\n");
00838 return false;
00839 }
00840 else
00841 {
00842 T_STRING<ST>* strings=NULL;
00843
00844 int32_t num=0;
00845 int32_t max_len=-1;
00846
00847
00848
00849 strings=new T_STRING<ST>[n];
00850
00851 for (int32_t i=0; i<n; i++)
00852 {
00853 char* fname=CIO::concat_filename(namelist[i]->d_name);
00854
00855 struct stat s;
00856 off_t filesize=0;
00857
00858 if (!stat(fname, &s) && s.st_size>0)
00859 {
00860 filesize=s.st_size/sizeof(ST);
00861
00862 FILE* f=fopen(fname, "ro");
00863 if (f)
00864 {
00865 ST* str=new ST[filesize];
00866 SG_DEBUG("%s:%ld\n", fname, (int64_t) filesize);
00867 fread(str, sizeof(ST), filesize, f);
00868 strings[num].string=str;
00869 strings[num].length=filesize;
00870 max_len=CMath::max(max_len, strings[num].length);
00871
00872 num++;
00873 fclose(f);
00874 }
00875 }
00876 else
00877 SG_ERROR("empty or non readable file \'%s\'\n", fname);
00878
00879 free(namelist[i]);
00880 }
00881 free(namelist);
00882
00883 if (num>0 && strings)
00884 {
00885 set_features(strings, num, max_len);
00886 return true;
00887 }
00888 }
00889 return false;
00890 }
00891
00899 bool set_features(T_STRING<ST>* p_features, int32_t p_num_vectors, int32_t p_max_string_length)
00900 {
00901 if (p_features)
00902 {
00903 CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet());
00904
00905
00906 for (int32_t i=0; i<p_num_vectors; i++)
00907 alpha->add_string_to_histogram( p_features[i].string, p_features[i].length);
00908
00909 SG_INFO("max_value_in_histogram:%d\n", alpha->get_max_value_in_histogram());
00910 SG_INFO("num_symbols_in_histogram:%d\n", alpha->get_num_symbols_in_histogram());
00911
00912 if (alpha->check_alphabet_size() && alpha->check_alphabet())
00913 {
00914 cleanup();
00915 SG_UNREF(alphabet);
00916
00917 alphabet=alpha;
00918 SG_REF(alphabet);
00919
00920 this->features=p_features;
00921 this->num_vectors=p_num_vectors;
00922 this->max_string_length=p_max_string_length;
00923
00924 return true;
00925 }
00926 else
00927 SG_UNREF(alpha);
00928 }
00929
00930 return false;
00931 }
00932
00939 virtual T_STRING<ST>* get_features(int32_t& num_str, int32_t& max_str_len)
00940 {
00941 num_str=num_vectors;
00942 max_str_len=max_string_length;
00943 return features;
00944 }
00945
00951 virtual void get_features(T_STRING<ST>** dst, int32_t* num_str)
00952 {
00953 *num_str=num_vectors;
00954 *dst=features;
00955 }
00956
00962 virtual bool save(char* dest)
00963 {
00964 return false;
00965 }
00966
00971 virtual int32_t get_size() { return sizeof(ST); }
00972
00978 virtual bool apply_preproc(bool force_preprocessing=false)
00979 {
00980 SG_DEBUG( "force: %d\n", force_preprocessing);
00981
00982 for (int32_t i=0; i<get_num_preproc(); i++)
00983 {
00984 if ( (!is_preprocessed(i) || force_preprocessing) )
00985 {
00986 set_preprocessed(i);
00987 CStringPreProc<ST>* p = (CStringPreProc<ST>*) get_preproc(i);
00988 SG_INFO( "preprocessing using preproc %s\n", p->get_name());
00989
00990 if (!p->apply_to_string_features(this))
00991 {
00992 SG_UNREF(p);
00993 return false;
00994 }
00995 else
00996 SG_UNREF(p);
00997 }
00998 }
00999 return true;
01000 }
01001
01011 int32_t obtain_by_sliding_window(int32_t window_size, int32_t step_size, int32_t skip=0)
01012 {
01013 ASSERT(step_size>0);
01014 ASSERT(window_size>0);
01015 ASSERT(num_vectors==1 || single_string);
01016 ASSERT(max_string_length>=window_size ||
01017 (single_string && length_of_single_string>=window_size));
01018
01019
01020
01021 if (single_string)
01022 num_vectors= (length_of_single_string-window_size)/step_size + 1;
01023 else if (num_vectors==1)
01024 {
01025 num_vectors= (max_string_length-window_size)/step_size + 1;
01026 length_of_single_string=max_string_length;
01027 }
01028
01029 T_STRING<ST>* f=new T_STRING<ST>[num_vectors];
01030 int32_t offs=0;
01031 for (int32_t i=0; i<num_vectors; i++)
01032 {
01033 f[i].string=&features[0].string[offs+skip];
01034 f[i].length=window_size-skip;
01035 offs+=step_size;
01036 }
01037 single_string=features[0].string;
01038 delete[] features;
01039 features=f;
01040 max_string_length=window_size-skip;
01041
01042 return num_vectors;
01043 }
01044
01053 int32_t obtain_by_position_list(int32_t window_size, CDynamicArray<int32_t>* positions, int32_t skip=0)
01054 {
01055 ASSERT(positions);
01056 ASSERT(window_size>0);
01057 ASSERT(num_vectors==1 || single_string);
01058 ASSERT(max_string_length>=window_size ||
01059 (single_string && length_of_single_string>=window_size));
01060
01061 num_vectors= positions->get_num_elements();
01062 ASSERT(num_vectors>0);
01063
01064 int32_t len;
01065
01066
01067
01068 if (single_string)
01069 len=length_of_single_string;
01070 else
01071 {
01072 single_string=features[0].string;
01073 len=max_string_length;
01074 length_of_single_string=max_string_length;
01075 }
01076
01077 T_STRING<ST>* f=new T_STRING<ST>[num_vectors];
01078 for (int32_t i=0; i<num_vectors; i++)
01079 {
01080 int32_t p=positions->get_element(i);
01081
01082 if (p>=0 && p<=len-window_size)
01083 {
01084 f[i].string=&features[0].string[p+skip];
01085 f[i].length=window_size-skip;
01086 }
01087 else
01088 {
01089 num_vectors=1;
01090 max_string_length=len;
01091 features[0].length=len;
01092 single_string=NULL;
01093 delete[] f;
01094 SG_ERROR("window (size:%d) starting at position[%d]=%d does not fit in sequence(len:%d)\n",
01095 window_size, i, p, len);
01096 return -1;
01097 }
01098 }
01099
01100 delete[] features;
01101 features=f;
01102 max_string_length=window_size-skip;
01103
01104 return num_vectors;
01105 }
01106
01118 inline bool obtain_from_char(CStringFeatures<char>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
01119 {
01120 return obtain_from_char_features(sf, start, p_order, gap, rev);
01121 }
01122
01132 template <class CT>
01133 bool obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
01134 {
01135 ASSERT(sf);
01136
01137 CAlphabet* alpha=sf->get_alphabet();
01138 ASSERT(alpha->get_num_symbols_in_histogram() > 0);
01139
01140 this->order=p_order;
01141 cleanup();
01142
01143 num_vectors=sf->get_num_vectors();
01144 ASSERT(num_vectors>0);
01145 max_string_length=sf->get_max_vector_length()-start;
01146 features=new T_STRING<ST>[num_vectors];
01147
01148 SG_DEBUG( "%1.0llf symbols in StringFeatures<*> %d symbols in histogram\n", sf->get_num_symbols(),
01149 alpha->get_num_symbols_in_histogram());
01150
01151 for (int32_t i=0; i<num_vectors; i++)
01152 {
01153 int32_t len=-1;
01154 CT* c=sf->get_feature_vector(i, len);
01155
01156 features[i].string=new ST[len];
01157 features[i].length=len;
01158
01159 ST* str=features[i].string;
01160 for (int32_t j=0; j<len; j++)
01161 str[j]=(ST) alpha->remap_to_bin(c[j]);
01162
01163 }
01164
01165 original_num_symbols=alpha->get_num_symbols();
01166 int32_t max_val=alpha->get_num_bits();
01167
01168 SG_UNREF(alpha);
01169
01170 if (p_order>1)
01171 num_symbols=CMath::powl((floatmax_t) 2, (floatmax_t) max_val*p_order);
01172 else
01173 num_symbols=original_num_symbols;
01174 SG_INFO( "max_val (bit): %d order: %d -> results in num_symbols: %.0Lf\n", max_val, p_order, num_symbols);
01175
01176 if ( ((floatmax_t) num_symbols) > CMath::powl(((floatmax_t) 2),((floatmax_t) sizeof(ST)*8)) )
01177 {
01178 SG_ERROR( "symbol does not fit into datatype \"%c\" (%d)\n", (char) max_val, (int) max_val);
01179 return false;
01180 }
01181
01182 SG_DEBUG( "translate: start=%i order=%i gap=%i(size:%i)\n", start, p_order, gap, sizeof(ST)) ;
01183 for (int32_t line=0; line<num_vectors; line++)
01184 {
01185 int32_t len=0;
01186 ST* fv=get_feature_vector(line, len);
01187
01188 if (rev)
01189 translate_from_single_order_reversed(fv, len, start+gap, p_order+gap, max_val, gap);
01190 else
01191 translate_from_single_order(fv, len, start+gap, p_order+gap, max_val, gap);
01192
01193
01194 features[line].length-=start+gap ;
01195 if (features[line].length<0)
01196 features[line].length=0 ;
01197 }
01198
01199 compute_symbol_mask_table(max_val);
01200
01201 return true;
01202 }
01203
01211 bool have_same_length(int32_t len=-1)
01212 {
01213 if (len!=-1)
01214 {
01215 if (len!=get_max_vector_length())
01216 return false;
01217 }
01218 len = get_max_vector_length();
01219
01220 for (int32_t i=0; i<num_vectors; i++)
01221 {
01222 if (get_vector_length(i)!=len)
01223 return false;
01224 }
01225
01226 return true;
01227 }
01228
01233 inline void embed_features(int32_t p_order)
01234 {
01235 ASSERT(alphabet->get_num_symbols_in_histogram() > 0);
01236
01237 order=p_order;
01238 original_num_symbols=alphabet->get_num_symbols();
01239 int32_t max_val=alphabet->get_num_bits();
01240
01241 if (p_order>1)
01242 num_symbols=CMath::powl((floatmax_t) 2, (floatmax_t) max_val*p_order);
01243 else
01244 num_symbols=original_num_symbols;
01245
01246 SG_INFO( "max_val (bit): %d order: %d -> results in num_symbols: %.0Lf\n", max_val, p_order, num_symbols);
01247
01248 if ( ((floatmax_t) num_symbols) > CMath::powl(((floatmax_t) 2),((floatmax_t) sizeof(ST)*8)) )
01249 SG_WARNING("symbols did not fit into datatype \"%c\" (%d)\n", (char) max_val, (int) max_val);
01250
01251 ST mask=0;
01252 for (int32_t i=0; i<p_order*max_val; i++)
01253 mask= (mask<<1) | ((ST) 1);
01254
01255 for (int32_t i=0; i<num_vectors; i++)
01256 {
01257 int32_t len=features[i].length;
01258
01259 if (len < p_order)
01260 SG_ERROR("Sequence must be longer than order (%d vs. %d)\n", len, p_order);
01261
01262 ST* str = features[i].string;
01263
01264
01265 for (int32_t j=0; j<p_order; j++)
01266 str[j]=(ST) alphabet->remap_to_bin(str[j]);
01267 str[0]=embed_word(&str[0], p_order);
01268
01269
01270 int32_t idx=0;
01271 for (int32_t j=p_order; j<len; j++)
01272 {
01273 str[j]=(ST) alphabet->remap_to_bin(str[j]);
01274 str[idx+1]= ((str[idx]<<max_val) | str[j]) & mask;
01275 idx++;
01276 }
01277
01278 features[i].length=len-p_order+1;
01279 }
01280
01281 compute_symbol_mask_table(max_val);
01282 }
01283
01288 inline void compute_symbol_mask_table(int64_t max_val)
01289 {
01290 delete[] symbol_mask_table;
01291 symbol_mask_table=new ST[256];
01292
01293 uint64_t mask=0;
01294 for (int32_t i=0; i< (int64_t) max_val; i++)
01295 mask=(mask<<1) | 1;
01296
01297 for (int32_t i=0; i<256; i++)
01298 {
01299 uint8_t bits=(uint8_t) i;
01300 symbol_mask_table[i]=0;
01301
01302 for (int32_t j=0; j<8; j++)
01303 {
01304 if (bits & 1)
01305 symbol_mask_table[i]|=mask<<(max_val*j);
01306
01307 bits>>=1;
01308 }
01309 }
01310 }
01311
01318 inline void unembed_word(ST word, uint8_t* seq, int32_t len)
01319 {
01320 uint32_t nbits= (uint32_t) alphabet->get_num_bits();
01321
01322 ST mask=0;
01323 for (int32_t i=0; i<nbits; i++)
01324 mask=(mask<<1) | (ST) 1;
01325
01326 for (int32_t i=0; i<len; i++)
01327 {
01328 ST w=(word & mask);
01329 seq[len-i-1]=alphabet->remap_to_char((uint8_t) w);
01330 word>>=nbits;
01331 }
01332 }
01333
01339 inline ST embed_word(ST* seq, int32_t len)
01340 {
01341 ST value=(ST) 0;
01342 uint32_t nbits= (uint32_t) alphabet->get_num_bits();
01343 for (int32_t i=0; i<len; i++)
01344 {
01345 value<<=nbits;
01346 value|=seq[i];
01347 }
01348
01349 return value;
01350 }
01351
01354 void determine_maximum_string_length()
01355 {
01356 max_string_length=0;
01357
01358 for (int32_t i=0; i<num_vectors; i++)
01359 max_string_length=CMath::max(max_string_length, features[i].length);
01360 }
01361
01363 inline virtual const char* get_name() const { return "StringFeatures"; }
01364
01365 protected:
01366
01375 void translate_from_single_order(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val)
01376 {
01377 int32_t i,j;
01378 ST value=0;
01379
01380 for (i=sequence_length-1; i>= p_order-1; i--)
01381 {
01382 value=0;
01383 for (j=i; j>=i-p_order+1; j--)
01384 value= (value >> max_val) | (obs[j] << (max_val * (p_order-1)));
01385
01386 obs[i]= (ST) value;
01387 }
01388
01389 for (i=p_order-2;i>=0;i--)
01390 {
01391 if (i>=sequence_length)
01392 continue;
01393
01394 value=0;
01395 for (j=i; j>=i-p_order+1; j--)
01396 {
01397 value= (value >> max_val);
01398 if (j>=0 && j<sequence_length)
01399 value|=obs[j] << (max_val * (p_order-1));
01400 }
01401 obs[i]=value;
01402 }
01403
01404
01405 for (i=start; i<sequence_length; i++)
01406 obs[i-start]=obs[i];
01407 }
01408
01417 void translate_from_single_order_reversed(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val)
01418 {
01419 int32_t i,j;
01420 ST value=0;
01421
01422 for (i=sequence_length-1; i>= p_order-1; i--)
01423 {
01424 value=0;
01425 for (j=i; j>=i-p_order+1; j--)
01426 value= (value << max_val) | obs[j];
01427
01428 obs[i]= (ST) value;
01429 }
01430
01431 for (i=p_order-2;i>=0;i--)
01432 {
01433 if (i>=sequence_length)
01434 continue;
01435
01436 value=0;
01437 for (j=i; j>=i-p_order+1; j--)
01438 {
01439 value= (value << max_val);
01440 if (j>=0 && j<sequence_length)
01441 value|=obs[j];
01442 }
01443 obs[i]=value;
01444 }
01445
01446
01447 for (i=start; i<sequence_length; i++)
01448 obs[i-start]=obs[i];
01449 }
01450
01460 void translate_from_single_order(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
01461 {
01462 ASSERT(gap>=0);
01463
01464 const int32_t start_gap=(p_order-gap)/2;
01465 const int32_t end_gap=start_gap+gap;
01466
01467 int32_t i,j;
01468 ST value=0;
01469
01470
01471 for (i=sequence_length-1; i>=p_order-1; i--)
01472 {
01473 value=0;
01474 for (j=i; j>=i-p_order+1; j--)
01475 {
01476 if (i-j<start_gap)
01477 {
01478 value= (value >> max_val) | (obs[j] << (max_val * (p_order-1-gap)));
01479 }
01480 else if (i-j>=end_gap)
01481 {
01482 value= (value >> max_val) | (obs[j] << (max_val * (p_order-1-gap)));
01483 }
01484 }
01485 obs[i]= (ST) value;
01486 }
01487
01488
01489 for (i=p_order-2;i>=0;i--)
01490 {
01491 if (i>=sequence_length)
01492 continue;
01493
01494 value=0;
01495 for (j=i; j>=i-p_order+1; j--)
01496 {
01497 if (i-j<start_gap)
01498 {
01499 value= (value >> max_val);
01500 if (j>=0 && j<sequence_length)
01501 value|=obs[j] << (max_val * (p_order-1-gap));
01502 }
01503 else if (i-j>=end_gap)
01504 {
01505 value= (value >> max_val);
01506 if (j>=0 && j<sequence_length)
01507 value|=obs[j] << (max_val * (p_order-1-gap));
01508 }
01509 }
01510 obs[i]=value;
01511 }
01512
01513
01514 for (i=start; i<sequence_length; i++)
01515 obs[i-start]=obs[i];
01516 }
01517
01527 void translate_from_single_order_reversed(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
01528 {
01529 ASSERT(gap>=0);
01530
01531 const int32_t start_gap=(p_order-gap)/2;
01532 const int32_t end_gap=start_gap+gap;
01533
01534 int32_t i,j;
01535 ST value=0;
01536
01537
01538 for (i=sequence_length-1; i>=p_order-1; i--)
01539 {
01540 value=0;
01541 for (j=i; j>=i-p_order+1; j--)
01542 {
01543 if (i-j<start_gap)
01544 value= (value << max_val) | obs[j];
01545 else if (i-j>=end_gap)
01546 value= (value << max_val) | obs[j];
01547 }
01548 obs[i]= (ST) value;
01549 }
01550
01551
01552 for (i=p_order-2;i>=0;i--)
01553 {
01554 if (i>=sequence_length)
01555 continue;
01556
01557 value=0;
01558 for (j=i; j>=i-p_order+1; j--)
01559 {
01560 if (i-j<start_gap)
01561 {
01562 value= value << max_val;
01563 if (j>=0 && j<sequence_length)
01564 value|=obs[j];
01565 }
01566 else if (i-j>=end_gap)
01567 {
01568 value= value << max_val;
01569 if (j>=0 && j<sequence_length)
01570 value|=obs[j];
01571 }
01572 }
01573 obs[i]=value;
01574 }
01575
01576
01577 for (i=start; i<sequence_length; i++)
01578 obs[i-start]=obs[i];
01579 }
01580
01581 protected:
01582
01589 virtual void set_feature_vector(int32_t num, ST* string, int32_t len)
01590 {
01591 ASSERT(features);
01592 ASSERT(num<num_vectors);
01593
01594 features[num].length=len ;
01595 features[num].string=string ;
01596 }
01597
01598
01599 protected:
01600
/** alphabet object; the constructors create it with new CAlphabet(alpha) and take a reference via SG_REF */
01602 CAlphabet* alphabet;
01603
/** number of entries in features; set_feature_vector() asserts its index is below this value */
01605 int32_t num_vectors;
01606
/** array of strings, each entry holding a data pointer and a length; NULL after construction from an alphabet only */
01608 T_STRING<ST>* features;
01609
/** initialised to NULL by the constructors; presumably one contiguous buffer backing all strings - TODO confirm against its users */
01611 ST* single_string;
01612
/** initialised to 0 by the constructors; presumably the number of elements in single_string - TODO confirm */
01614 int32_t length_of_single_string;
01615
/** initialised to 0 by the constructors; presumably the length of the longest string in features - TODO confirm */
01617 int32_t max_string_length;
01618
/** number of symbols; set from alphabet->get_num_symbols() in the constructor */
01620 floatmax_t num_symbols;
01621
/** copy of num_symbols taken in the constructor; presumably kept as the single-order symbol count - TODO confirm */
01623 floatmax_t original_num_symbols;
01624
/** initialised to 0 by the constructors; presumably the order of the higher-order symbols currently stored - TODO confirm */
01626 int32_t order;
01627
/** initialised to NULL by the constructors; presumably filled by compute_symbol_mask_table() - TODO confirm */
01629 ST* symbol_mask_table;
01630 };
01631
01632 #ifndef DOXYGEN_SHOULD_SKIP_THIS
01633
/** explicit specialization: strings of bool report feature type F_BOOL */
01637 template<> inline EFeatureType CStringFeatures<bool>::get_feature_type()
01638 {
01639 return F_BOOL;
01640 }
01641
/** explicit specialization: strings of char report feature type F_CHAR */
01646 template<> inline EFeatureType CStringFeatures<char>::get_feature_type()
01647 {
01648 return F_CHAR;
01649 }
01650
/** explicit specialization: strings of uint8_t report feature type F_BYTE */
01655 template<> inline EFeatureType CStringFeatures<uint8_t>::get_feature_type()
01656 {
01657 return F_BYTE;
01658 }
01659
/** explicit specialization: strings of int16_t report feature type F_SHORT */
01664 template<> inline EFeatureType CStringFeatures<int16_t>::get_feature_type()
01665 {
01666 return F_SHORT;
01667 }
01668
/** explicit specialization: strings of uint16_t report feature type F_WORD */
01673 template<> inline EFeatureType CStringFeatures<uint16_t>::get_feature_type()
01674 {
01675 return F_WORD;
01676 }
01677
/** explicit specialization: strings of int32_t report feature type F_INT */
01682 template<> inline EFeatureType CStringFeatures<int32_t>::get_feature_type()
01683 {
01684 return F_INT;
01685 }
01686
/** explicit specialization: strings of uint32_t report feature type F_UINT */
01691 template<> inline EFeatureType CStringFeatures<uint32_t>::get_feature_type()
01692 {
01693 return F_UINT;
01694 }
01695
/** explicit specialization: strings of int64_t report feature type F_LONG */
01700 template<> inline EFeatureType CStringFeatures<int64_t>::get_feature_type()
01701 {
01702 return F_LONG;
01703 }
01704
/** explicit specialization: strings of uint64_t report feature type F_ULONG */
01709 template<> inline EFeatureType CStringFeatures<uint64_t>::get_feature_type()
01710 {
01711 return F_ULONG;
01712 }
01713
/** explicit specialization: strings of float32_t report feature type F_SHORTREAL */
01718 template<> inline EFeatureType CStringFeatures<float32_t>::get_feature_type()
01719 {
01720 return F_SHORTREAL;
01721 }
01722
/** explicit specialization: strings of float64_t report feature type F_DREAL */
01727 template<> inline EFeatureType CStringFeatures<float64_t>::get_feature_type()
01728 {
01729 return F_DREAL;
01730 }
01731
/** explicit specialization: strings of floatmax_t report feature type F_LONGREAL */
01736 template<> inline EFeatureType CStringFeatures<floatmax_t>::get_feature_type()
01737 {
01738 return F_LONGREAL;
01739 }
01740
/* get_masked_symbols() specializations for bool and the floating point types:
 * the symbol is returned unchanged and the mask argument is ignored
 * (presumably because bit masking is not meaningful for these types). */

/** bool: returns symbol as is, mask is ignored */
01741 template<> inline bool CStringFeatures<bool>::get_masked_symbols(bool symbol, uint8_t mask)
01742 {
01743 return symbol;
01744 }
/** float32_t: returns symbol as is, mask is ignored */
01745 template<> inline float32_t CStringFeatures<float32_t>::get_masked_symbols(float32_t symbol, uint8_t mask)
01746 {
01747 return symbol;
01748 }
/** float64_t: returns symbol as is, mask is ignored */
01749 template<> inline float64_t CStringFeatures<float64_t>::get_masked_symbols(float64_t symbol, uint8_t mask)
01750 {
01751 return symbol;
01752 }
/** floatmax_t: returns symbol as is, mask is ignored */
01753 template<> inline floatmax_t CStringFeatures<floatmax_t>::get_masked_symbols(floatmax_t symbol, uint8_t mask)
01754 {
01755 return symbol;
01756 }
01757
/* shift_offset() specializations for bool and the floating point types:
 * both arguments are ignored and a constant (false / 0) is returned
 * (presumably because shifting is not meaningful for these types). */

/** bool: always returns false */
01758 template<> inline bool CStringFeatures<bool>::shift_offset(bool symbol, int32_t amount)
01759 {
01760 return false;
01761 }
/** float32_t: always returns 0 */
01762 template<> inline float32_t CStringFeatures<float32_t>::shift_offset(float32_t symbol, int32_t amount)
01763 {
01764 return 0;
01765 }
/** float64_t: always returns 0 */
01766 template<> inline float64_t CStringFeatures<float64_t>::shift_offset(float64_t symbol, int32_t amount)
01767 {
01768 return 0;
01769 }
/** floatmax_t: always returns 0 */
01770 template<> inline floatmax_t CStringFeatures<floatmax_t>::shift_offset(floatmax_t symbol, int32_t amount)
01771 {
01772 return 0;
01773 }
01774
/* shift_symbol() specializations for bool and the floating point types:
 * the symbol is returned unchanged and the shift amount is ignored
 * (presumably because shifting is not meaningful for these types). */

/** bool: returns symbol as is, amount is ignored */
01775 template<> inline bool CStringFeatures<bool>::shift_symbol(bool symbol, int32_t amount)
01776 {
01777 return symbol;
01778 }
/** float32_t: returns symbol as is, amount is ignored */
01779 template<> inline float32_t CStringFeatures<float32_t>::shift_symbol(float32_t symbol, int32_t amount)
01780 {
01781 return symbol;
01782 }
/** float64_t: returns symbol as is, amount is ignored */
01783 template<> inline float64_t CStringFeatures<float64_t>::shift_symbol(float64_t symbol, int32_t amount)
01784 {
01785 return symbol;
01786 }
/** floatmax_t: returns symbol as is, amount is ignored */
01787 template<> inline floatmax_t CStringFeatures<floatmax_t>::shift_symbol(floatmax_t symbol, int32_t amount)
01788 {
01789 return symbol;
01790 }
01791
/* Gapped translate_from_single_order() specializations for the floating
 * point types: intentionally empty, obs is left untouched (the generic
 * implementation relies on bit shifts, which floating point types lack). */

/** float32_t: no-op */
01792 template<> inline void CStringFeatures<float32_t>::translate_from_single_order(float32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
01793 {
01794 }
01795
/** float64_t: no-op */
01796 template<> inline void CStringFeatures<float64_t>::translate_from_single_order(float64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
01797 {
01798 }
01799
/** floatmax_t: no-op */
01800 template<> inline void CStringFeatures<floatmax_t>::translate_from_single_order(floatmax_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
01801 {
01802 }
01803
/* translate_from_single_order_reversed() specializations for the floating
 * point types: intentionally empty, obs is left untouched (the generic
 * implementation relies on bit shifts, which floating point types lack). */

/** float32_t: no-op */
01804 template<> inline void CStringFeatures<float32_t>::translate_from_single_order_reversed(float32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
01805 {
01806 }
01807
/** float64_t: no-op */
01808 template<> inline void CStringFeatures<float64_t>::translate_from_single_order_reversed(float64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
01809 {
01810 }
01811
/** floatmax_t: no-op */
01812 template<> inline void CStringFeatures<floatmax_t>::translate_from_single_order_reversed(floatmax_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
01813 {
01814 }
01815
/* obtain_from_char_features() specializations for the floating point types:
 * all arguments are ignored, nothing is converted and false is returned for
 * every source character type CT. */

/** float32_t: always returns false */
01816 template<> template <class CT> bool CStringFeatures<float32_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
01817 {
01818 return false;
01819 }
/** float64_t: always returns false */
01820 template<> template <class CT> bool CStringFeatures<float64_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
01821 {
01822 return false;
01823 }
/** floatmax_t: always returns false */
01824 template<> template <class CT> bool CStringFeatures<floatmax_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
01825 {
01826 return false;
01827 }
01828
/* embed_features() specializations for the floating point types:
 * intentionally empty, the stored features are not modified. */

/** float32_t: no-op */
01829 template<> inline void CStringFeatures<float32_t>::embed_features(int32_t p_order)
01830 {
01831 }
/** float64_t: no-op */
01832 template<> inline void CStringFeatures<float64_t>::embed_features(int32_t p_order)
01833 {
01834 }
/** floatmax_t: no-op */
01835 template<> inline void CStringFeatures<floatmax_t>::embed_features(int32_t p_order)
01836 {
01837 }
01838
/* compute_symbol_mask_table() specializations for the floating point types:
 * intentionally empty, no mask table is built here. */

/** float32_t: no-op */
01839 template<> inline void CStringFeatures<float32_t>::compute_symbol_mask_table(int64_t max_val)
01840 {
01841 }
/** float64_t: no-op */
01842 template<> inline void CStringFeatures<float64_t>::compute_symbol_mask_table(int64_t max_val)
01843 {
01844 }
/** floatmax_t: no-op */
01845 template<> inline void CStringFeatures<floatmax_t>::compute_symbol_mask_table(int64_t max_val)
01846 {
01847 }
01848
/* embed_word() specializations for the floating point types:
 * seq and len are ignored and 0 is returned. */

/** float32_t: always returns 0 */
01849 template<> inline float32_t CStringFeatures<float32_t>::embed_word(float32_t* seq, int32_t len)
01850 {
01851 return 0;
01852 }
/** float64_t: always returns 0 */
01853 template<> inline float64_t CStringFeatures<float64_t>::embed_word(float64_t* seq, int32_t len)
01854 {
01855 return 0;
01856 }
/** floatmax_t: always returns 0 */
01857 template<> inline floatmax_t CStringFeatures<floatmax_t>::embed_word(floatmax_t* seq, int32_t len)
01858 {
01859 return 0;
01860 }
01861
/* unembed_word() specializations for the floating point types:
 * intentionally empty, seq is never written. */

/** float32_t: no-op */
01862 template<> inline void CStringFeatures<float32_t>::unembed_word(float32_t word, uint8_t* seq, int32_t len)
01863 {
01864 }
/** float64_t: no-op */
01865 template<> inline void CStringFeatures<float64_t>::unembed_word(float64_t word, uint8_t* seq, int32_t len)
01866 {
01867 }
/** floatmax_t: no-op */
01868 template<> inline void CStringFeatures<floatmax_t>::unembed_word(floatmax_t word, uint8_t* seq, int32_t len)
01869 {
01870 }
01871 #endif // DOXYGEN_SHOULD_SKIP_THIS
01872 #endif // _CSTRINGFEATURES__H__