Template class StringFeatures implements a list of strings.
As this class is a template, the underlying storage type is quite arbitrary and not limited to character strings, but could also be sequences of floating point numbers etc. Strings differ from matrices (cf. CSimpleFeatures) in that the dimensionality of the feature vectors (i.e. the strings) is not fixed; it may vary between strings.
Most string kernels require StringFeatures, but a number of them actually require strings to have the same length.
Note: StringFeatures do not support PreProcs
Definition at line 71 of file StringFeatures.h.
Public Member Functions | |
CStringFeatures (EAlphabet alpha) | |
CStringFeatures (T_STRING< ST > *p_features, int32_t p_num_vectors, int32_t p_max_string_length, EAlphabet alpha) | |
CStringFeatures (CAlphabet *alpha) | |
CStringFeatures (const CStringFeatures &orig) | |
CStringFeatures (char *fname, EAlphabet alpha=DNA) | |
virtual | ~CStringFeatures () |
void | cleanup () |
virtual EFeatureClass | get_feature_class () |
virtual EFeatureType | get_feature_type () |
CAlphabet * | get_alphabet () |
virtual CFeatures * | duplicate () const |
void | get_feature_vector (ST **dst, int32_t *len, int32_t num) |
void | set_feature_vector (ST *src, int32_t len, int32_t num) |
virtual ST * | get_feature_vector (int32_t num, int32_t &len) |
virtual ST | get_feature (int32_t vec_num, int32_t feat_num) |
virtual int32_t | get_vector_length (int32_t vec_num) |
virtual int32_t | get_max_vector_length () |
virtual int32_t | get_num_vectors () |
floatmax_t | get_num_symbols () |
floatmax_t | get_max_num_symbols () |
floatmax_t | get_original_num_symbols () |
int32_t | get_order () |
ST | get_masked_symbols (ST symbol, uint8_t mask) |
ST | shift_offset (ST offset, int32_t amount) |
ST | shift_symbol (ST symbol, int32_t amount) |
virtual bool | load (char *fname) |
bool | load_dna_file (char *fname, bool remap_to_bin=true) |
bool | load_fasta_file (const char *fname, bool ignore_invalid=false) |
bool | load_fastq_file (const char *fname, bool ignore_invalid=false, bool bitremap_in_single_string=false) |
bool | load_from_directory (char *dirname) |
bool | set_features (T_STRING< ST > *p_features, int32_t p_num_vectors, int32_t p_max_string_length) |
virtual T_STRING< ST > * | get_features (int32_t &num_str, int32_t &max_str_len) |
virtual void | get_features (T_STRING< ST > **dst, int32_t *num_str) |
virtual bool | save (char *dest) |
virtual int32_t | get_size () |
virtual bool | apply_preproc (bool force_preprocessing=false) |
int32_t | obtain_by_sliding_window (int32_t window_size, int32_t step_size, int32_t skip=0) |
int32_t | obtain_by_position_list (int32_t window_size, CDynamicArray< int32_t > *positions, int32_t skip=0) |
bool | obtain_from_char (CStringFeatures< char > *sf, int32_t start, int32_t p_order, int32_t gap, bool rev) |
template<class CT > | |
bool | obtain_from_char_features (CStringFeatures< CT > *sf, int32_t start, int32_t p_order, int32_t gap, bool rev) |
bool | have_same_length (int32_t len=-1) |
void | embed_features (int32_t p_order) |
void | compute_symbol_mask_table (int64_t max_val) |
void | unembed_word (ST word, uint8_t *seq, int32_t len) |
ST | embed_word (ST *seq, int32_t len) |
void | determine_maximum_string_length () |
virtual const char * | get_name () const |
Protected Member Functions | |
void | translate_from_single_order (ST *obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val) |
void | translate_from_single_order_reversed (ST *obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val) |
void | translate_from_single_order (ST *obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap) |
void | translate_from_single_order_reversed (ST *obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap) |
virtual void | set_feature_vector (int32_t num, ST *string, int32_t len) |
Protected Attributes | |
CAlphabet * | alphabet |
alphabet | |
int32_t | num_vectors |
number of string vectors | |
T_STRING< ST > * | features |
this contains the array of features. | |
ST * | single_string |
true when single string / created by sliding window | |
int32_t | length_of_single_string |
length of prior single string | |
int32_t | max_string_length |
length of longest string | |
floatmax_t | num_symbols |
number of used symbols | |
floatmax_t | original_num_symbols |
original number of used symbols (before higher order mapping) | |
int32_t | order |
order used in higher order mapping | |
ST * | symbol_mask_table |
symbol mask table (required to access bit-based symbols) |
CStringFeatures< ST >::CStringFeatures | ( | EAlphabet | alpha | ) |
constructor
alpha | alphabet (type) to use for string features |
Definition at line 78 of file StringFeatures.h.
CStringFeatures< ST >::CStringFeatures | ( | T_STRING< ST > * | p_features, | |
int32_t | p_num_vectors, | |||
int32_t | p_max_string_length, | |||
EAlphabet | alpha | |||
) |
constructor
p_features | new features | |
p_num_vectors | number of vectors | |
p_max_string_length | maximum string length | |
alpha | alphabet (type) to use for string features |
Definition at line 96 of file StringFeatures.h.
CStringFeatures< ST >::CStringFeatures | ( | CAlphabet * | alpha | ) |
constructor
alpha | alphabet to use for string features |
Definition at line 113 of file StringFeatures.h.
CStringFeatures< ST >::CStringFeatures | ( | const CStringFeatures< ST > & | orig | ) |
copy constructor
Definition at line 126 of file StringFeatures.h.
CStringFeatures< ST >::CStringFeatures | ( | char * | fname, | |
EAlphabet | alpha = DNA | |||
) |
constructor
fname | filename to load features from | |
alpha | alphabet (type) to use for string features |
Definition at line 166 of file StringFeatures.h.
virtual CStringFeatures< ST >::~CStringFeatures | ( | ) | [virtual] |
Definition at line 179 of file StringFeatures.h.
virtual bool CStringFeatures< ST >::apply_preproc | ( | bool | force_preprocessing = false |
) | [virtual] |
apply preprocessor
force_preprocessing | if preprocessing shall be forced |
Definition at line 978 of file StringFeatures.h.
void CStringFeatures< ST >::cleanup | ( | ) |
cleanup string features
Definition at line 187 of file StringFeatures.h.
void CStringFeatures< ST >::compute_symbol_mask_table | ( | int64_t | max_val | ) |
compute symbol mask table
required to access bit-based symbols
Definition at line 1288 of file StringFeatures.h.
void CStringFeatures< ST >::determine_maximum_string_length | ( | ) |
determine new maximum string length
Definition at line 1354 of file StringFeatures.h.
virtual CFeatures* CStringFeatures< ST >::duplicate | ( | ) | const [virtual] |
duplicate feature object
Implements CFeatures.
Definition at line 245 of file StringFeatures.h.
void CStringFeatures< ST >::embed_features | ( | int32_t | p_order | ) |
embed string features in bit representation in-place
Definition at line 1233 of file StringFeatures.h.
ST CStringFeatures< ST >::embed_word | ( | ST * | seq, | |
int32_t | len | |||
) |
embed a single word
seq | sequence of size len in a bitfield | |
len | length of sequence |
Definition at line 1339 of file StringFeatures.h.
CAlphabet* CStringFeatures< ST >::get_alphabet | ( | ) |
get alphabet used in string features
Definition at line 235 of file StringFeatures.h.
virtual ST CStringFeatures< ST >::get_feature | ( | int32_t | vec_num, | |
int32_t | feat_num | |||
) | [virtual] |
get feature
vec_num | which vector | |
feat_num | which feature |
Definition at line 318 of file StringFeatures.h.
virtual EFeatureClass CStringFeatures< ST >::get_feature_class | ( | ) | [virtual] |
get feature class
Implements CFeatures.
Definition at line 223 of file StringFeatures.h.
virtual EFeatureType CStringFeatures< ST >::get_feature_type | ( | ) | [virtual] |
get feature type
Implements CFeatures.
Definition at line 229 of file StringFeatures.h.
virtual ST* CStringFeatures< ST >::get_feature_vector | ( | int32_t | num, | |
int32_t & | len | |||
) | [virtual] |
get feature vector for sample num
num | index of feature vector | |
len | length is returned by reference |
Definition at line 303 of file StringFeatures.h.
void CStringFeatures< ST >::get_feature_vector | ( | ST ** | dst, | |
int32_t * | len, | |||
int32_t | num | |||
) |
get string for selected example num
dst | destination where vector will be stored | |
len | number of features in vector | |
num | index of the string |
Definition at line 256 of file StringFeatures.h.
virtual void CStringFeatures< ST >::get_features | ( | T_STRING< ST > ** | dst, | |
int32_t * | num_str | |||
) | [virtual] |
get_features (swig compatible)
dst | string features (returned) | |
num_str | number of strings (returned) |
Definition at line 951 of file StringFeatures.h.
virtual T_STRING<ST>* CStringFeatures< ST >::get_features | ( | int32_t & | num_str, | |
int32_t & | max_str_len | |||
) | [virtual] |
get_features
num_str | number of strings (returned) | |
max_str_len | maximal string length (returned) |
Definition at line 939 of file StringFeatures.h.
ST CStringFeatures< ST >::get_masked_symbols | ( | ST | symbol, | |
uint8_t | mask | |||
) |
a higher order mapped symbol will be shaped such that the symbols specified by bits in the mask will be returned.
symbol | symbol to mask | |
mask | mask to apply |
Definition at line 390 of file StringFeatures.h.
floatmax_t CStringFeatures< ST >::get_max_num_symbols | ( | ) |
get maximum number of symbols
Note: floatmax_t sounds weird, but int64_t is not long enough (and there is no int128_t type)
Definition at line 367 of file StringFeatures.h.
virtual int32_t CStringFeatures< ST >::get_max_vector_length | ( | ) | [virtual] |
get maximum vector length
Definition at line 341 of file StringFeatures.h.
virtual const char* CStringFeatures< ST >::get_name | ( | ) | const [virtual] |
floatmax_t CStringFeatures< ST >::get_num_symbols | ( | ) |
get number of symbols
Note: floatmax_t sounds weird, but LONG is not long enough
Definition at line 358 of file StringFeatures.h.
virtual int32_t CStringFeatures< ST >::get_num_vectors | ( | ) | [virtual] |
get number of vectors
Implements CFeatures.
Definition at line 350 of file StringFeatures.h.
int32_t CStringFeatures< ST >::get_order | ( | ) |
floatmax_t CStringFeatures< ST >::get_original_num_symbols | ( | ) |
number of symbols before higher order mapping
Definition at line 375 of file StringFeatures.h.
virtual int32_t CStringFeatures< ST >::get_size | ( | ) | [virtual] |
get memory footprint of one feature
Implements CFeatures.
Definition at line 971 of file StringFeatures.h.
virtual int32_t CStringFeatures< ST >::get_vector_length | ( | int32_t | vec_num | ) | [virtual] |
get vector length
vec_num | which vector |
Definition at line 331 of file StringFeatures.h.
bool CStringFeatures< ST >::have_same_length | ( | int32_t | len = -1 |
) |
check if length of each vector in this feature object equals the given length.
len | vector length to check against |
Definition at line 1211 of file StringFeatures.h.
virtual bool CStringFeatures< ST >::load | ( | char * | fname | ) | [virtual] |
load features from file
fname | filename to load from |
Reimplemented from CFeatures.
Definition at line 425 of file StringFeatures.h.
bool CStringFeatures< ST >::load_dna_file | ( | char * | fname, | |
bool | remap_to_bin = true | |||
) |
load DNA features from file
fname | filename to load from | |
remap_to_bin | if remap_to_bin |
Definition at line 485 of file StringFeatures.h.
bool CStringFeatures< ST >::load_fasta_file | ( | const char * | fname, | |
bool | ignore_invalid = false | |||
) |
load fasta file as string features
fname | filename to load from | |
ignore_invalid | if set to true, characters other than A,C,G,T are converted to A |
Definition at line 622 of file StringFeatures.h.
bool CStringFeatures< ST >::load_fastq_file | ( | const char * | fname, | |
bool | ignore_invalid = false, | |||
bool | bitremap_in_single_string = false | |||
) |
load fastq file as string features
fname | filename to load from | |
ignore_invalid | if set to true, characters other than A,C,G,T are converted to A | |
bitremap_in_single_string | if set to true, do binary embedding of symbols |
Definition at line 720 of file StringFeatures.h.
bool CStringFeatures< ST >::load_from_directory | ( | char * | dirname | ) |
load features from directory
dirname | directory name to load from |
Definition at line 825 of file StringFeatures.h.
int32_t CStringFeatures< ST >::obtain_by_position_list | ( | int32_t | window_size, | |
CDynamicArray< int32_t > * | positions, | |||
int32_t | skip = 0 | |||
) |
extracts windows of size window_size from first string using the positions in list
window_size | window size | |
positions | positions | |
skip | skip |
Definition at line 1053 of file StringFeatures.h.
int32_t CStringFeatures< ST >::obtain_by_sliding_window | ( | int32_t | window_size, | |
int32_t | step_size, | |||
int32_t | skip = 0 | |||
) |
slides a window of size window_size over the current single string; step_size is the amount by which the window is shifted. Creates (string_len-window_size)/step_size many feature objects. If skip is nonzero, skips the first 'skip' characters of each string.
window_size | window size | |
step_size | step size | |
skip | skip |
Definition at line 1011 of file StringFeatures.h.
bool CStringFeatures< ST >::obtain_from_char | ( | CStringFeatures< char > * | sf, | |
int32_t | start, | |||
int32_t | p_order, | |||
int32_t | gap, | |||
bool | rev | |||
) |
obtain string features from char features
wrapper for template method
sf | string features | |
start | start | |
p_order | order | |
gap | gap | |
rev | reverse |
Definition at line 1118 of file StringFeatures.h.
bool CStringFeatures< ST >::obtain_from_char_features | ( | CStringFeatures< CT > * | sf, | |
int32_t | start, | |||
int32_t | p_order, | |||
int32_t | gap, | |||
bool | rev | |||
) |
template obtain from char features
sf | string features | |
start | start | |
p_order | order | |
gap | gap | |
rev | reverse |
Definition at line 1133 of file StringFeatures.h.
virtual bool CStringFeatures< ST >::save | ( | char * | dest | ) | [virtual] |
save features to file
dest | filename to save to |
Reimplemented from CFeatures.
Definition at line 962 of file StringFeatures.h.
virtual void CStringFeatures< ST >::set_feature_vector | ( | int32_t | num, | |
ST * | string, | |||
int32_t | len | |||
) | [protected, virtual] |
set feature vector for sample num
num | index of feature vector | |
string | string with the feature vector's content | |
len | length of the string |
Definition at line 1589 of file StringFeatures.h.
void CStringFeatures< ST >::set_feature_vector | ( | ST * | src, | |
int32_t | len, | |||
int32_t | num | |||
) |
set string for selected example num
src | source string to store as the feature vector |
len | number of features in vector | |
num | index of the string |
Definition at line 276 of file StringFeatures.h.
bool CStringFeatures< ST >::set_features | ( | T_STRING< ST > * | p_features, | |
int32_t | p_num_vectors, | |||
int32_t | p_max_string_length | |||
) |
set features
p_features | new features | |
p_num_vectors | number of vectors | |
p_max_string_length | maximum string length |
Definition at line 899 of file StringFeatures.h.
ST CStringFeatures< ST >::shift_offset | ( | ST | offset, | |
int32_t | amount | |||
) |
shift offset to the left by amount
offset | offset to shift | |
amount | amount to shift the offset |
Definition at line 402 of file StringFeatures.h.
ST CStringFeatures< ST >::shift_symbol | ( | ST | symbol, | |
int32_t | amount | |||
) |
shift symbol to the right by amount (taking care of custom symbol sizes)
symbol | symbol to shift | |
amount | amount to shift the symbol |
Definition at line 414 of file StringFeatures.h.
void CStringFeatures< ST >::translate_from_single_order | ( | ST * | obs, | |
int32_t | sequence_length, | |||
int32_t | start, | |||
int32_t | p_order, | |||
int32_t | max_val, | |||
int32_t | gap | |||
) | [protected] |
translate from single order
obs | observation | |
sequence_length | length of sequence | |
start | start | |
p_order | order | |
max_val | maximum value | |
gap | gap |
Definition at line 1460 of file StringFeatures.h.
void CStringFeatures< ST >::translate_from_single_order | ( | ST * | obs, | |
int32_t | sequence_length, | |||
int32_t | start, | |||
int32_t | p_order, | |||
int32_t | max_val | |||
) | [protected] |
translate from single order
obs | observation | |
sequence_length | length of sequence | |
start | start | |
p_order | order | |
max_val | maximum value |
Definition at line 1375 of file StringFeatures.h.
void CStringFeatures< ST >::translate_from_single_order_reversed | ( | ST * | obs, | |
int32_t | sequence_length, | |||
int32_t | start, | |||
int32_t | p_order, | |||
int32_t | max_val, | |||
int32_t | gap | |||
) | [protected] |
translate from single order reversed
obs | observation | |
sequence_length | length of sequence | |
start | start | |
p_order | order | |
max_val | maximum value | |
gap | gap |
Definition at line 1527 of file StringFeatures.h.
void CStringFeatures< ST >::translate_from_single_order_reversed | ( | ST * | obs, | |
int32_t | sequence_length, | |||
int32_t | start, | |||
int32_t | p_order, | |||
int32_t | max_val | |||
) | [protected] |
translate from single order reversed
obs | observation | |
sequence_length | length of sequence | |
start | start | |
p_order | order | |
max_val | maximum value |
Definition at line 1417 of file StringFeatures.h.
void CStringFeatures< ST >::unembed_word | ( | ST | word, | |
uint8_t * | seq, | |||
int32_t | len | |||
) |
remap bit-based word to character sequence
word | word to remap | |
seq | sequence of size len that remapped characters are written to | |
len | length of sequence and word |
Definition at line 1318 of file StringFeatures.h.
CAlphabet* CStringFeatures< ST >::alphabet [protected] |
alphabet
Definition at line 1602 of file StringFeatures.h.
T_STRING<ST>* CStringFeatures< ST >::features [protected] |
this contains the array of features.
Definition at line 1608 of file StringFeatures.h.
int32_t CStringFeatures< ST >::length_of_single_string [protected] |
length of prior single string
Definition at line 1614 of file StringFeatures.h.
int32_t CStringFeatures< ST >::max_string_length [protected] |
length of longest string
Definition at line 1617 of file StringFeatures.h.
floatmax_t CStringFeatures< ST >::num_symbols [protected] |
number of used symbols
Definition at line 1620 of file StringFeatures.h.
int32_t CStringFeatures< ST >::num_vectors [protected] |
number of string vectors
Definition at line 1605 of file StringFeatures.h.
int32_t CStringFeatures< ST >::order [protected] |
order used in higher order mapping
Definition at line 1626 of file StringFeatures.h.
floatmax_t CStringFeatures< ST >::original_num_symbols [protected] |
original number of used symbols (before higher order mapping)
Definition at line 1623 of file StringFeatures.h.
ST* CStringFeatures< ST >::single_string [protected] |
true when single string / created by sliding window
Definition at line 1611 of file StringFeatures.h.
ST* CStringFeatures< ST >::symbol_mask_table [protected] |
symbol mask table (required to access bit-based symbols)
Definition at line 1629 of file StringFeatures.h.