MindyGramFeatures.h

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2006 Konrad Rieck
00008  * Written (W) 2006-2009 Soeren Sonnenburg
00009  * Copyright (C) 2006-2009 Fraunhofer Institute FIRST and Max-Planck-Society
00010  */
00011 
00012 #include "lib/config.h"
00013 
00014 #ifdef HAVE_MINDY
00015 
00016 #ifndef _CMINDYGRAMFEATURES__H__
00017 #define _CMINDYGRAMFEATURES__H__
00018 
00019 #include "features/Features.h"
00020 #include "features/StringFeatures.h"
00021 
00022 #include <mindy.h>
00023 
00025 class CMindyGramFeatures : public CFeatures
00026 {
00027     public:
00035         CMindyGramFeatures(char *aname, char *embed, char *delim, uint8_t nlen)
00036         : CFeatures(0)
00037         {
00038             ASSERT(aname && embed && delim);
00039 
00040             /* Allocate and generate gram configuration (words) */
00041             SG_DEBUG( "Initializing Mindy gram features\n");
00042             if (nlen == 0)
00043                 cfg = micfg_words(alph_get_type(aname), delim);
00044             else
00045                 cfg = micfg_ngrams(alph_get_type(aname), (byte_t) nlen);
00046 
00047             /* Set delimiters */
00048             if (strlen(delim) > 0)
00049                 micfg_set_delim(cfg, delim);
00050             
00051             /* Set embedding */
00052             if (!strcasecmp(embed, "freq"))
00053                 micfg_set_embed(cfg, ME_FREQ);
00054             else if (!strcasecmp(embed, "count"))
00055                 micfg_set_embed(cfg, ME_COUNT);
00056             else if (!strcasecmp(embed, "bin"))
00057                 micfg_set_embed(cfg, ME_BIN);
00058             else
00059                 SG_ERROR("Unknown embedding mode '%s'", embed);
00060 
00061             if (nlen == 0)
00062                SG_INFO("Mindy in word mode (d: '%s', a: %s, e: %s)\n",
00063                        delim, aname, micfg_get_embed(cfg->embed));
00064             else  
00065                SG_INFO("Mindy in n-gram mode (n: '%d', a: %s, e: %s)\n",
00066                        nlen, aname, micfg_get_embed(cfg->embed));
00067         }
00068 
00073 #if 0
00074          CMindyGramFeatures(const CMindyGramFeatures & orig) : CFeatures(orig)
00075          {
00076                  SG_DEBUG( "Duplicating Mindy gram features\n");
00077                  num_vectors = orig.num_vectors;
00078 
00079                  /* Clone configuration */
00080                  cfg = micfg_clone(orig.cfg);
00081 
00082                  /* Clone gram vectors */
00083                  vectors = (gram_t **) calloc(num_vectors, sizeof(gram_t *));
00084                  for (int32_t i = 0; i < num_vectors; i++)
00085                          vectors[i] = gram_clone(orig.vectors[i]);
00086                 }
00087 #endif
00088 
00094         template <class T> 
00095         bool import_features(CStringFeatures<T> *sf)
00096         {
00097             int32_t i;
00098             num_vectors = sf->get_num_vectors();
00099             SG_INFO( "Importing %ld string features\n", num_vectors);
00100 
00101             vectors = (gram_t **) calloc(num_vectors, sizeof(gram_t *));
00102             if (!vectors) {
00103                 SG_ERROR( "Could not allocate memory\n");
00104                 return false;
00105             }
00106 
00107             for (i = 0; i < num_vectors; i++) {
00108                 int32_t len;
00109                 T *s = sf->get_feature_vector(i, len);
00110                 vectors[i] = gram_extract(cfg, (byte_t *) s, (size_t) len);
00111 
00112                 SG_DEBUG( "Extracted gram vector %d: %d grams\n", i, 
00113                         vectors[i]->num);
00114             }
00115 
00116             return true;
00117         }
00118 
00119         /* Destructors */
00120         virtual ~CMindyGramFeatures();
00121 
00122         CFeatures *duplicate() const;
00123 
00124         /* Feature and vector functions */
00125         gram_t *get_feature_vector(int32_t i);
00126         void set_feature_vector(int32_t i, gram_t * g);
00127         uint64_t get_feature(int32_t i, int32_t j);
00128         int32_t get_vector_length(int32_t i);
00129         void trim_max(float64_t m);
00130 
00131         /* Simple functions */
00132         virtual int32_t get_num_vectors() { return num_vectors; }
00133         virtual int32_t get_size() { return sizeof(gram_t *); }
00134         EFeatureClass get_feature_class() { return C_MINDYGRAM; }
00135         EFeatureType get_feature_type() { return F_ULONG; }
00136 
00138         inline virtual const char* get_name() const { return "MindyGramFeatures"; }
00139 
00140     protected:
00141         /* Import and load functions */
00142         virtual bool load(char * fname);
00143 
00144     private:
00146         int32_t num_vectors;
00148         gram_t **vectors;
00150         micfg_t *cfg;
00151 };
00152 #endif
00153 #endif

SHOGUN Machine Learning Toolbox - Documentation