Template class StringFeatures implements a list of strings.
As this class is a template the underlying storage type is quite arbitrary and not limited to character strings, but could also be sequences of floating point numbers etc. Strings differ from matrices (cf. CSimpleFeatures) in that the dimensionality of the feature vectors (i.e. the strings) is not fixed; it may vary between strings.
Most string kernels require StringFeatures, but a number of them actually require the strings to have the same length.
When preprocessors are attached to string features they may shorten the string, but are not allowed to return strings longer than max_string_length, as some algorithms depend on this.
Also note that string features cannot currently be computed on-the-fly.
在文件StringFeatures.h第127行定义。
公有成员 | |
CStringFeatures () | |
CStringFeatures (EAlphabet alpha) | |
CStringFeatures (T_STRING< ST > *p_features, int32_t p_num_vectors, int32_t p_max_string_length, EAlphabet alpha) | |
CStringFeatures (T_STRING< ST > *p_features, int32_t p_num_vectors, int32_t p_max_string_length, CAlphabet *alpha) | |
CStringFeatures (CAlphabet *alpha) | |
CStringFeatures (const CStringFeatures &orig) | |
CStringFeatures (CFile *loader, EAlphabet alpha=DNA) | |
virtual | ~CStringFeatures () |
virtual void | cleanup () |
virtual void | cleanup_feature_vector (int32_t num) |
virtual EFeatureClass | get_feature_class () |
virtual EFeatureType | get_feature_type () |
CAlphabet * | get_alphabet () |
virtual CFeatures * | duplicate () const |
void | get_feature_vector (ST **dst, int32_t *len, int32_t num) |
void | set_feature_vector (ST *src, int32_t len, int32_t num) |
void | enable_on_the_fly_preprocessing () |
void | disable_on_the_fly_preprocessing () |
ST * | get_feature_vector (int32_t num, int32_t &len, bool &dofree) |
CStringFeatures< ST > * | get_transposed () |
T_STRING< ST > * | get_transposed (int32_t &num_feat, int32_t &num_vec) |
void | free_feature_vector (ST *feat_vec, int32_t num, bool dofree) |
virtual ST | get_feature (int32_t vec_num, int32_t feat_num) |
virtual int32_t | get_vector_length (int32_t vec_num) |
virtual int32_t | get_max_vector_length () |
virtual int32_t | get_num_vectors () |
floatmax_t | get_num_symbols () |
floatmax_t | get_max_num_symbols () |
floatmax_t | get_original_num_symbols () |
int32_t | get_order () |
ST | get_masked_symbols (ST symbol, uint8_t mask) |
ST | shift_offset (ST offset, int32_t amount) |
ST | shift_symbol (ST symbol, int32_t amount) |
virtual void | load (CFile *loader) |
void | load_ascii_file (char *fname, bool remap_to_bin=true, EAlphabet ascii_alphabet=DNA, EAlphabet binary_alphabet=RAWDNA) |
bool | load_fasta_file (const char *fname, bool ignore_invalid=false) |
bool | load_fastq_file (const char *fname, bool ignore_invalid=false, bool bitremap_in_single_string=false) |
bool | load_from_directory (char *dirname) |
bool | set_features (T_STRING< ST > *p_features, int32_t p_num_vectors, int32_t p_max_string_length) |
bool | append_features (CStringFeatures< ST > *sf) |
bool | append_features (T_STRING< ST > *p_features, int32_t p_num_vectors, int32_t p_max_string_length) |
virtual T_STRING< ST > * | get_features (int32_t &num_str, int32_t &max_str_len) |
virtual T_STRING< ST > * | copy_features (int32_t &num_str, int32_t &max_str_len) |
virtual void | get_features (T_STRING< ST > **dst, int32_t *num_str) |
virtual void | save (CFile *writer) |
virtual bool | load_compressed (char *src, bool decompress) |
virtual bool | save_compressed (char *dest, E_COMPRESSION_TYPE compression, int level) |
virtual int32_t | get_size () |
virtual bool | apply_preproc (bool force_preprocessing=false) |
int32_t | obtain_by_sliding_window (int32_t window_size, int32_t step_size, int32_t skip=0) |
int32_t | obtain_by_position_list (int32_t window_size, CDynamicArray< int32_t > *positions, int32_t skip=0) |
bool | obtain_from_char (CStringFeatures< char > *sf, int32_t start, int32_t p_order, int32_t gap, bool rev) |
template<class CT > | |
bool | obtain_from_char_features (CStringFeatures< CT > *sf, int32_t start, int32_t p_order, int32_t gap, bool rev) |
bool | have_same_length (int32_t len=-1) |
void | embed_features (int32_t p_order) |
void | compute_symbol_mask_table (int64_t max_val) |
void | unembed_word (ST word, uint8_t *seq, int32_t len) |
ST | embed_word (ST *seq, int32_t len) |
void | determine_maximum_string_length () |
virtual void | set_feature_vector (int32_t num, ST *string, int32_t len) |
virtual void | get_histogram (float64_t **hist, int32_t *rows, int32_t *cols, bool normalize=true) |
virtual void | create_random (float64_t *hist, int32_t rows, int32_t cols, int32_t num_vec) |
virtual const char * | get_name () const |
静态公有成员 | |
static ST * | get_zero_terminated_string_copy (T_STRING< ST > str) |
保护成员 | |
virtual ST * | compute_feature_vector (int32_t num, int32_t &len) |
保护属性 | |
CAlphabet * | alphabet |
alphabet | |
int32_t | num_vectors |
number of string vectors | |
T_STRING< ST > * | features |
this contains the array of features. | |
ST * | single_string |
true when single string / created by sliding window | |
int32_t | length_of_single_string |
length of prior single string | |
int32_t | max_string_length |
length of longest string | |
floatmax_t | num_symbols |
number of used symbols | |
floatmax_t | original_num_symbols |
original number of used symbols (before higher order mapping) | |
int32_t | order |
order used in higher order mapping | |
ST * | symbol_mask_table |
symbol mask table (cf. compute_symbol_mask_table) | |
bool | preprocess_on_get |
preprocess on-the-fly? | |
CCache< ST > * | feature_cache |
CStringFeatures | ( | ) |
default constructor
在文件StringFeatures.h第133行定义。
CStringFeatures | ( | EAlphabet | alpha ) |
CStringFeatures | ( | T_STRING< ST > * | p_features, |
int32_t | p_num_vectors, | ||
int32_t | p_max_string_length, | ||
EAlphabet | alpha | ||
) |
constructor
p_features | new features |
p_num_vectors | number of vectors |
p_max_string_length | maximum string length |
alpha | alphabet (type) to use for string features |
在文件StringFeatures.h第163行定义。
CStringFeatures | ( | T_STRING< ST > * | p_features, |
int32_t | p_num_vectors, | ||
int32_t | p_max_string_length, | ||
CAlphabet * | alpha | ||
) |
constructor
p_features | new features |
p_num_vectors | number of vectors |
p_max_string_length | maximum string length |
alpha | an actual alphabet |
在文件StringFeatures.h第184行定义。
CStringFeatures | ( | CAlphabet * | alpha ) |
CStringFeatures | ( | const CStringFeatures< ST > & | orig ) |
copy constructor
在文件StringFeatures.h第216行定义。
CStringFeatures | ( | CFile * | loader, |
EAlphabet | alpha = DNA |
||
) |
constructor
loader | File object via which to load data |
alpha | alphabet (type) to use for string features |
在文件StringFeatures.h第256行定义。
virtual ~CStringFeatures | ( | ) | [virtual] |
在文件StringFeatures.h第268行定义。
bool append_features | ( | CStringFeatures< ST > * | sf ) |
bool append_features | ( | T_STRING< ST > * | p_features, |
int32_t | p_num_vectors, | ||
int32_t | p_max_string_length | ||
) |
append features
p_features | features to append |
p_num_vectors | number of vectors |
p_max_string_length | maximum string length |
note that p_features will be delete[]'d on success
在文件StringFeatures.h第1139行定义。
virtual bool apply_preproc | ( | bool | force_preprocessing = false ) |
[virtual] |
apply preprocessor
force_preprocessing | if preprocessing shall be forced |
在文件StringFeatures.h第1411行定义。
virtual void cleanup | ( | ) | [virtual] |
virtual void cleanup_feature_vector | ( | int32_t | num ) | [virtual] |
virtual ST* compute_feature_vector | ( | int32_t | num, |
int32_t & | len | ||
) | [protected, virtual] |
compute feature vector for sample num; if target is set the vector is written to target; len is returned by reference
default implementation returns
num | which vector |
len | length of vector |
在文件StringFeatures.h第1926行定义。
void compute_symbol_mask_table | ( | int64_t | max_val ) |
virtual T_STRING<ST>* copy_features | ( | int32_t & | num_str, |
int32_t & | max_str_len | ||
) | [virtual] |
copy_features
num_str | number of strings (returned) |
max_str_len | maximal string length (returned) |
在文件StringFeatures.h第1208行定义。
virtual void create_random | ( | float64_t * | hist, |
int32_t | rows, | ||
int32_t | cols, | ||
int32_t | num_vec | ||
) | [virtual] |
create some random strings based on normalized histogram
在文件StringFeatures.h第1879行定义。
void determine_maximum_string_length | ( | ) |
determine new maximum string length
在文件StringFeatures.h第1790行定义。
void disable_on_the_fly_preprocessing | ( | ) |
call this to disable on the fly feature preprocessing on get_feature_vector. Useful when you manually apply preprocessors.
在文件StringFeatures.h第410行定义。
virtual CFeatures* duplicate | ( | ) | const [virtual] |
void embed_features | ( | int32_t | p_order ) |
embed string features in bit representation in-place
在文件StringFeatures.h第1669行定义。
ST embed_word | ( | ST * | seq, |
int32_t | len | ||
) |
void enable_on_the_fly_preprocessing | ( | ) |
call this to preprocess string features upon get_feature_vector
在文件StringFeatures.h第402行定义。
void free_feature_vector | ( | ST * | feat_vec, |
int32_t | num, | ||
bool | dofree | ||
) |
free feature vector
feat_vec | feature vector to free |
num | index in feature cache |
dofree | if vector should be really deleted |
在文件StringFeatures.h第519行定义。
CAlphabet* get_alphabet | ( | ) |
virtual ST get_feature | ( | int32_t | vec_num, |
int32_t | feat_num | ||
) | [virtual] |
virtual EFeatureClass get_feature_class | ( | ) | [virtual] |
virtual EFeatureType get_feature_type | ( | ) | [virtual] |
ST* get_feature_vector | ( | int32_t | num, |
int32_t & | len, | ||
bool & | dofree | ||
) |
get feature vector for sample num
num | index of feature vector |
len | length is returned by reference |
dofree | whether returned vector must be freed by caller via free_feature_vector |
在文件StringFeatures.h第423行定义。
void get_feature_vector | ( | ST ** | dst, |
int32_t * | len, | ||
int32_t | num | ||
) |
get string for selected example num
dst | destination where vector will be stored |
len | number of features in vector |
num | index of the string |
在文件StringFeatures.h第354行定义。
virtual T_STRING<ST>* get_features | ( | int32_t & | num_str, |
int32_t & | max_str_len | ||
) | [virtual] |
get_features
num_str | number of strings (returned) |
max_str_len | maximal string length (returned) |
在文件StringFeatures.h第1195行定义。
virtual void get_features | ( | T_STRING< ST > ** | dst, |
int32_t * | num_str | ||
) | [virtual] |
get_features (swig compatible)
dst | string features (returned) |
num_str | number of strings (returned) |
在文件StringFeatures.h第1235行定义。
virtual void get_histogram | ( | float64_t ** | hist, |
int32_t * | rows, | ||
int32_t * | cols, | ||
bool | normalize = true |
||
) | [virtual] |
compute histogram over strings
在文件StringFeatures.h第1834行定义。
ST get_masked_symbols | ( | ST | symbol, |
uint8_t | mask | ||
) |
a higher order mapped symbol will be shaped such that the symbols specified by bits in the mask will be returned.
symbol | symbol to mask |
mask | mask to apply |
在文件StringFeatures.h第613行定义。
floatmax_t get_max_num_symbols | ( | ) |
get maximum number of symbols
Note: floatmax_t sounds weird, but int64_t is not long enough (and there is no int128_t type)
在文件StringFeatures.h第590行定义。
virtual int32_t get_max_vector_length | ( | ) | [virtual] |
virtual const char* get_name | ( | ) | const [virtual] |
floatmax_t get_num_symbols | ( | ) |
get number of symbols
Note: floatmax_t sounds weird, but int64_t is not long enough
在文件StringFeatures.h第581行定义。
virtual int32_t get_num_vectors | ( | ) | [virtual] |
int32_t get_order | ( | ) |
floatmax_t get_original_num_symbols | ( | ) |
number of symbols before higher order mapping
在文件StringFeatures.h第598行定义。
virtual int32_t get_size | ( | ) | [virtual] |
get memory footprint of one feature
实现了CFeatures。
在文件StringFeatures.h第1404行定义。
T_STRING<ST>* get_transposed | ( | int32_t & | num_feat, |
int32_t & | num_vec | ||
) |
compute and return the transpose of the string features matrix, which will be preprocessed. num_feat and num_vec are returned by reference; the caller has to clean up
note that strings all have to have same length
num_feat | number of features in matrix |
num_vec | number of vectors in matrix |
在文件StringFeatures.h第482行定义。
CStringFeatures<ST>* get_transposed | ( | ) |
virtual int32_t get_vector_length | ( | int32_t | vec_num ) | [virtual] |
static ST* get_zero_terminated_string_copy | ( | T_STRING< ST > | str ) | [static] |
get a zero terminated copy of the string
str | the string to copy |
note that this function is only sensible for character strings
在文件StringFeatures.h第1805行定义。
bool have_same_length | ( | int32_t | len = -1 ) |
check if length of each vector in this feature object equals the given length.
len | vector length to check against |
在文件StringFeatures.h第1647行定义。
virtual void load | ( | CFile * | loader ) | [virtual] |
void load_ascii_file | ( | char * | fname, |
bool | remap_to_bin = true , |
||
EAlphabet | ascii_alphabet = DNA , |
||
EAlphabet | binary_alphabet = RAWDNA |
||
) |
load ascii line-based string features from file
fname | filename to load from |
remap_to_bin | if translation to other binary alphabet should be performed |
ascii_alphabet | src alphabet |
binary_alphabet | alphabet to translate to |
在文件StringFeatures.h第657行定义。
virtual bool load_compressed | ( | char * | src, |
bool | decompress | ||
) | [virtual] |
load compressed features from file
src | filename to load from |
decompress | whether to decompress on loading |
在文件StringFeatures.h第1255行定义。
bool load_fasta_file | ( | const char * | fname, |
bool | ignore_invalid = false |
||
) |
load fasta file as string features
fname | filename to load from |
ignore_invalid | if set to true, characters other than A,C,G,T are converted to A |
在文件StringFeatures.h第796行定义。
bool load_fastq_file | ( | const char * | fname, |
bool | ignore_invalid = false , |
||
bool | bitremap_in_single_string = false |
||
) |
load fastq file as string features
fname | filename to load from |
ignore_invalid | if set to true, characters other than A,C,G,T are converted to A |
bitremap_in_single_string | if set to true, do binary embedding of symbols |
在文件StringFeatures.h第895行定义。
bool load_from_directory | ( | char * | dirname ) |
load features from directory
dirname | directory name to load from |
在文件StringFeatures.h第1000行定义。
int32_t obtain_by_position_list | ( | int32_t | window_size, |
CDynamicArray< int32_t > * | positions, | ||
int32_t | skip = 0 |
||
) |
extracts windows of size window_size from first string using the positions in list
window_size | window size |
positions | positions |
skip | skip |
在文件StringFeatures.h第1486行定义。
int32_t obtain_by_sliding_window | ( | int32_t | window_size, |
int32_t | step_size, | ||
int32_t | skip = 0 |
||
) |
slides a window of size window_size over the current single string; step_size is the amount by which the window is shifted. Creates (string_len-window_size)/step_size many feature objects. If skip is nonzero, skip the first 'skip' characters of each string
window_size | window size |
step_size | step size |
skip | skip |
在文件StringFeatures.h第1444行定义。
bool obtain_from_char | ( | CStringFeatures< char > * | sf, |
int32_t | start, | ||
int32_t | p_order, | ||
int32_t | gap, | ||
bool | rev | ||
) |
obtain string features from char features
wrapper for template method
sf | string features |
start | start |
p_order | order |
gap | gap |
rev | reverse |
在文件StringFeatures.h第1551行定义。
bool obtain_from_char_features | ( | CStringFeatures< CT > * | sf, |
int32_t | start, | ||
int32_t | p_order, | ||
int32_t | gap, | ||
bool | rev | ||
) |
template obtain from char features
sf | string features |
start | start |
p_order | order |
gap | gap |
rev | reverse |
在文件StringFeatures.h第1566行定义。
virtual void save | ( | CFile * | writer ) | [virtual] |
virtual bool save_compressed | ( | char * | dest, |
E_COMPRESSION_TYPE | compression, | ||
int | level | ||
) | [virtual] |
save compressed features to file
dest | filename to save to |
compression | compressor to use |
level | compression level to use (1-9) |
在文件StringFeatures.h第1342行定义。
void set_feature_vector | ( | ST * | src, |
int32_t | len, | ||
int32_t | num | ||
) |
set string for selected example num
src | source string from which the vector will be set |
len | number of features in vector |
num | index of the string |
在文件StringFeatures.h第379行定义。
virtual void set_feature_vector | ( | int32_t | num, |
ST * | string, | ||
int32_t | len | ||
) | [virtual] |
set feature vector for sample num
num | index of feature vector |
string | string with the feature vector's content |
len | length of the string |
在文件StringFeatures.h第1820行定义。
bool set_features | ( | T_STRING< ST > * | p_features, |
int32_t | p_num_vectors, | ||
int32_t | p_max_string_length | ||
) |
set features
p_features | new features |
p_num_vectors | number of vectors |
p_max_string_length | maximum string length |
在文件StringFeatures.h第1074行定义。
ST shift_offset | ( | ST | offset, |
int32_t | amount | ||
) |
shift offset to the left by amount
offset | offset to shift |
amount | amount to shift the offset |
在文件StringFeatures.h第625行定义。
ST shift_symbol | ( | ST | symbol, |
int32_t | amount | ||
) |
shift symbol to the right by amount (taking care of custom symbol sizes)
symbol | symbol to shift |
amount | amount to shift the symbol |
在文件StringFeatures.h第637行定义。
void unembed_word | ( | ST | word, |
uint8_t * | seq, | ||
int32_t | len | ||
) |
remap bit-based word to character sequence
word | word to remap |
seq | sequence of size len that remapped characters are written to |
len | length of sequence and word |
在文件StringFeatures.h第1754行定义。
alphabet
在文件StringFeatures.h第2026行定义。
CCache<ST>* feature_cache [protected] |
feature cache
在文件StringFeatures.h第2059行定义。
T_STRING<ST>* features [protected] |
this contains the array of features.
在文件StringFeatures.h第2032行定义。
int32_t length_of_single_string [protected] |
length of prior single string
在文件StringFeatures.h第2038行定义。
int32_t max_string_length [protected] |
length of longest string
在文件StringFeatures.h第2041行定义。
floatmax_t num_symbols [protected] |
number of used symbols
在文件StringFeatures.h第2044行定义。
int32_t num_vectors [protected] |
number of string vectors
在文件StringFeatures.h第2029行定义。
int32_t order [protected] |
order used in higher order mapping
在文件StringFeatures.h第2050行定义。
floatmax_t original_num_symbols [protected] |
original number of used symbols (before higher order mapping)
在文件StringFeatures.h第2047行定义。
bool preprocess_on_get [protected] |
preprocess on-the-fly?
在文件StringFeatures.h第2056行定义。
ST* single_string [protected] |
true when single string / created by sliding window
在文件StringFeatures.h第2035行定义。
ST* symbol_mask_table [protected] |
symbol mask table (cf. compute_symbol_mask_table)
在文件StringFeatures.h第2053行定义。