MindyGramFeatures.h
Go to the documentation of this file.00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #include "lib/config.h"
00013
00014 #ifdef HAVE_MINDY
00015
00016 #ifndef _CMINDYGRAMFEATURES__H__
00017 #define _CMINDYGRAMFEATURES__H__
00018
00019 #include "features/Features.h"
00020 #include "features/StringFeatures.h"
00021
00022 #include <mindy.h>
00023
00025 class CMindyGramFeatures : public CFeatures
00026 {
00027 public:
00035 CMindyGramFeatures(char *aname, char *embed, char *delim, uint8_t nlen)
00036 : CFeatures(0)
00037 {
00038 ASSERT(aname && embed && delim);
00039
00040
00041 SG_DEBUG( "Initializing Mindy gram features\n");
00042 if (nlen == 0)
00043 cfg = micfg_words(alph_get_type(aname), delim);
00044 else
00045 cfg = micfg_ngrams(alph_get_type(aname), (byte_t) nlen);
00046
00047
00048 if (strlen(delim) > 0)
00049 micfg_set_delim(cfg, delim);
00050
00051
00052 if (!strcasecmp(embed, "freq"))
00053 micfg_set_embed(cfg, ME_FREQ);
00054 else if (!strcasecmp(embed, "count"))
00055 micfg_set_embed(cfg, ME_COUNT);
00056 else if (!strcasecmp(embed, "bin"))
00057 micfg_set_embed(cfg, ME_BIN);
00058 else
00059 SG_ERROR("Unknown embedding mode '%s'", embed);
00060
00061 if (nlen == 0)
00062 SG_INFO("Mindy in word mode (d: '%s', a: %s, e: %s)\n",
00063 delim, aname, micfg_get_embed(cfg->embed));
00064 else
00065 SG_INFO("Mindy in n-gram mode (n: '%d', a: %s, e: %s)\n",
00066 nlen, aname, micfg_get_embed(cfg->embed));
00067 }
00068
00073 #if 0
00074 CMindyGramFeatures(const CMindyGramFeatures & orig) : CFeatures(orig)
00075 {
00076 SG_DEBUG( "Duplicating Mindy gram features\n");
00077 num_vectors = orig.num_vectors;
00078
00079
00080 cfg = micfg_clone(orig.cfg);
00081
00082
00083 vectors = (gram_t **) calloc(num_vectors, sizeof(gram_t *));
00084 for (int32_t i = 0; i < num_vectors; i++)
00085 vectors[i] = gram_clone(orig.vectors[i]);
00086 }
00087 #endif
00088
00094 template <class T>
00095 bool import_features(CStringFeatures<T> *sf)
00096 {
00097 int32_t i;
00098 num_vectors = sf->get_num_vectors();
00099 SG_INFO( "Importing %ld string features\n", num_vectors);
00100
00101 vectors = (gram_t **) calloc(num_vectors, sizeof(gram_t *));
00102 if (!vectors) {
00103 SG_ERROR( "Could not allocate memory\n");
00104 return false;
00105 }
00106
00107 for (i = 0; i < num_vectors; i++) {
00108 int32_t len;
00109 T *s = sf->get_feature_vector(i, len);
00110 vectors[i] = gram_extract(cfg, (byte_t *) s, (size_t) len);
00111
00112 SG_DEBUG( "Extracted gram vector %d: %d grams\n", i,
00113 vectors[i]->num);
00114 }
00115
00116 return true;
00117 }
00118
00119
00120 virtual ~CMindyGramFeatures();
00121
00122 CFeatures *duplicate() const;
00123
00124
00125 gram_t *get_feature_vector(int32_t i);
00126 void set_feature_vector(int32_t i, gram_t * g);
00127 uint64_t get_feature(int32_t i, int32_t j);
00128 int32_t get_vector_length(int32_t i);
00129 void trim_max(float64_t m);
00130
00131
00132 virtual int32_t get_num_vectors() { return num_vectors; }
00133 virtual int32_t get_size() { return sizeof(gram_t *); }
00134 EFeatureClass get_feature_class() { return C_MINDYGRAM; }
00135 EFeatureType get_feature_type() { return F_ULONG; }
00136
00138 inline virtual const char* get_name() const { return "MindyGramFeatures"; }
00139
00140 protected:
00141
00142 virtual bool load(char * fname);
00143
00144 private:
00146 int32_t num_vectors;
00148 gram_t **vectors;
00150 micfg_t *cfg;
00151 };
00152 #endif
00153 #endif