This page lists ready-to-run Shogun examples for the Python Modular interface.
To run an example, issue
python name_of_example.py
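The examples below use Python 2 syntax (print statements) and load their toy data via relative paths such as ../data/fm_train_real.dat, so they are best run from within the examples directory of the Shogun source distribution.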
# This example shows how to use a custom kernel function for training a
# two-class Support Vector Machine (SVM) classifier on randomly generated
# examples. The SVM regularization constant is set to C=1.

from numpy import *
from numpy.random import rand
from shogun.Features import RealFeatures, Labels
from shogun.Kernel import CustomKernel
from shogun.Classifier import LibSVM

C=1
dim=7

lab=sign(2*rand(dim) - 1)
data=rand(dim, dim)
# symmetric positive semi-definite matrix to be used as kernel matrix
symdata=dot(data, data.T)

kernel=CustomKernel()
kernel.set_full_kernel_matrix_from_full(symdata)
labels=Labels(lab)
svm=LibSVM(C, kernel, labels)
svm.train()
out=svm.classify().get_labels()
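As a quick sanity check, the fraction of correctly classified training examples can be computed with plain NumPy (an illustrative sketch reusing out and lab from above):

# training accuracy of the SVM above (sketch)
print mean(sign(out)==lab)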
# In this example we demonstrate how to use SVMs in a domain adaptation
# scenario. We assume two problem domains: one with an abundance of training
# data (the source domain) and one with only a few training examples (the
# target domain). These domains are assumed to be different but related enough
# to transfer information between them. We therefore first train an SVM on the
# source domain and then pass this trained SVM object to the DASVM, which we
# train on the target domain. The DASVM internally computes a custom linear
# term (for the underlying quadratic program of the dual formulation of the
# SVM) based on the support vectors of the source SVM and the training
# examples of the target domain. Finally, it can be used for prediction just
# like any other SVM object.

import numpy
from shogun.Features import StringCharFeatures, Labels, DNA
from shogun.Kernel import WeightedDegreeStringKernel
from shogun.Classifier import SVMLight, DomainAdaptationSVM

degree=3

# source domain data
fm_train_dna = ['CGCACGTACGTAGCTCGAT', 'CGACGTAGTCGTAGTCGTA',
    'CGACGGGGGGGGGGTCGTA', 'CGACCTAGTCGTAGTCGTA', 'CGACCACAGTTATATAGTA',
    'CGACGTAGTCGTAGTCGTA', 'CGACGTAGTTTTTTTCGTA', 'CGACGTAGTCGTAGCCCCA',
    'CAAAAAAAAAAAAAAAATA', 'CGACGGGGGGGGGGGCGTA']
label_train_dna = numpy.array(5*[-1.0] + 5*[1.0])
fm_test_dna = ['AGCACGTACGTAGCTCGAT', 'AGACGTAGTCGTAGTCGTA',
    'CAACGGGGGGGGGGTCGTA', 'CGACCTAGTCGTAGTCGTA', 'CGAACACAGTTATATAGTA',
    'CGACCTAGTCGTAGTCGTA', 'CGACGTGGGGTTTTTCGTA', 'CGACGTAGTCCCAGCCCCA',
    'CAAAAAAAAAAAACCAATA', 'CGACGGCCGGGGGGGCGTA']
label_test_dna = numpy.array(5*[-1.0] + 5*[1.0])

# target domain data
fm_train_dna2 = ['AGACAGTCAGTCGATAGCT', 'AGCAGTCGTAGTCGTAGTC',
    'AGCAGGGGGGGGGGTAGTC', 'AGCAATCGTAGTCGTAGTC', 'AGCAACACGTTCTCTCGTC',
    'AGCAGTCGTAGTCGTAGTC', 'AGCAGTCGTTTTTTTAGTC', 'AGCAGTCGTAGTCGAAAAC',
    'ACCCCCCCCCCCCCCCCTC', 'AGCAGGGGGGGGGGGAGTC']
label_train_dna2 = numpy.array(5*[-1.0] + 5*[1.0])
fm_test_dna2 = ['CGACAGTCAGTCGATAGCT', 'CGCAGTCGTAGTCGTAGTC',
    'ACCAGGGGGGGGGGTAGTC', 'AGCAATCGTAGTCGTAGTC', 'AGCCACACGTTCTCTCGTC',
    'AGCAATCGTAGTCGTAGTC', 'AGCAGTGGGGTTTTTAGTC', 'AGCAGTCGTAAACGAAAAC',
    'ACCCCCCCCCCCCAACCTC', 'AGCAGGAAGGGGGGGAGTC']
label_test_dna2 = numpy.array(5*[-1.0] + 5*[1.0])

C = 1.0

# train an SVM on the source domain
feats_train = StringCharFeatures(fm_train_dna, DNA)
feats_test = StringCharFeatures(fm_test_dna, DNA)
kernel = WeightedDegreeStringKernel(feats_train, feats_train, degree)
labels = Labels(label_train_dna)
svm = SVMLight(C, kernel, labels)
svm.train()

#####################################
print "obtaining DA SVM from previously trained SVM"

# train the DA-SVM on the target domain data, regularizing against the
# previously obtained source-domain solution
feats_train2 = StringCharFeatures(fm_train_dna2, DNA)
feats_test2 = StringCharFeatures(fm_test_dna2, DNA)
kernel2 = WeightedDegreeStringKernel(feats_train2, feats_train2, degree)
labels2 = Labels(label_train_dna2)
dasvm = DomainAdaptationSVM(C, kernel2, labels2, svm, 1.0)
dasvm.train()

out = dasvm.classify(feats_test2).get_labels()
print out
# In this example a multi-class support vector machine is trained on a toy
# data set and the trained classifier is then used to predict labels of test
# examples. The training algorithm is based on the BSVM formulation (L2-soft
# margin with the bias added to the objective function), which is solved by
# the Improved Mitchell-Demyanov-Malozemov algorithm. The training uses a
# Gaussian kernel of width 2.1 and the regularization constant C=1. The
# solver stops if the relative duality gap falls below 1e-5.
#
# For more details on the SVM solver see
#  V. Franc: Optimization Algorithms for Kernel Methods. Research report.
#  CTU-CMP-2005-22. CTU FEL Prague. 2005.
#  ftp://cmp.felk.cvut.cz/pub/cmp/articles/franc/Franc-PhD.pdf

def gmnpsvm ():
    print 'GMNPSVM'
    from shogun.Features import RealFeatures, Labels
    from shogun.Kernel import GaussianKernel
    from shogun.Classifier import GMNPSVM

    feats_train=RealFeatures(fm_train_real)
    feats_test=RealFeatures(fm_test_real)
    width=2.1
    kernel=GaussianKernel(feats_train, feats_train, width)

    C=1
    epsilon=1e-5
    labels=Labels(label_train_multiclass)

    svm=GMNPSVM(C, kernel, labels)
    svm.set_epsilon(epsilon)
    svm.train(feats_train)
    #kernel.init(feats_train, feats_test)
    out=svm.classify(feats_test).get_labels()

if __name__=='__main__':
    from tools.load import LoadMatrix
    lm=LoadMatrix()
    fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
    label_train_multiclass=lm.load_labels('../data/label_train_multiclass.dat')
    gmnpsvm()
# In this example a two-class support vector machine classifier is trained on
# a toy data set and the trained classifier is then used to predict labels of
# test examples. As training algorithm the Gradient Projection Decomposition
# Technique (GPDT) is used with SVM regularization parameter C=1 and a
# Gaussian kernel of width 2.1. The solver returns an epsilon-precise
# (epsilon=1e-5) solution.
#
# For more details on the GPDT solver see http://dm.unife.it/gpdt

def gpbtsvm ():
    print 'GPBTSVM'
    from shogun.Features import RealFeatures, Labels
    from shogun.Kernel import GaussianKernel
    from shogun.Classifier import GPBTSVM

    feats_train=RealFeatures(fm_train_real)
    feats_test=RealFeatures(fm_test_real)
    width=2.1
    kernel=GaussianKernel(feats_train, feats_train, width)

    C=1
    epsilon=1e-5
    labels=Labels(label_train_twoclass)

    svm=GPBTSVM(C, kernel, labels)
    svm.set_epsilon(epsilon)
    svm.train()

    kernel.init(feats_train, feats_test)
    svm.classify().get_labels()

if __name__=='__main__':
    from tools.load import LoadMatrix
    lm=LoadMatrix()
    fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
    label_train_twoclass=lm.load_labels('../data/label_train_twoclass.dat')
    gpbtsvm()
# This example shows usage of a k-nearest neighbor (KNN) classification rule
# on a toy data set. The number of nearest neighbors is set to k=3 and
# distances are measured with the Euclidean metric. Finally, the KNN rule is
# applied to predict labels of test examples.

def knn ():
    print 'KNN'
    from shogun.Features import RealFeatures, Labels
    from shogun.Classifier import KNN
    from shogun.Distance import EuclidianDistance

    feats_train=RealFeatures(fm_train_real)
    feats_test=RealFeatures(fm_test_real)
    distance=EuclidianDistance(feats_train, feats_train)

    k=3
    labels=Labels(label_train_multiclass)

    knn=KNN(k, distance, labels)
    knn.train()
    output=knn.classify(feats_test).get_labels()

if __name__=='__main__':
    from tools.load import LoadMatrix
    lm=LoadMatrix()
    fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
    label_train_multiclass=lm.load_labels('../data/label_train_multiclass.dat')
    knn()
# In this example a multi-class support vector machine classifier is trained
# on a toy data set and the trained classifier is then used to predict labels
# of test examples. As training algorithm the LaRank algorithm is used with
# SVM regularization parameter C=1, a Gaussian kernel of width 2.1, and the
# precision set to epsilon=1e-5.
#
# For more details on LaRank see
#  Bordes, A. and Bottou, L. and Gallinari, P. and Weston, J.
#  Solving MultiClass Support Vector Machines with LaRank. ICML 2007.

def larank ():
    print 'LaRank'
    from shogun.Features import RealFeatures, Labels
    from shogun.Kernel import GaussianKernel
    from shogun.Classifier import LaRank

    feats_train=RealFeatures(fm_train_real)
    feats_test=RealFeatures(fm_test_real)
    width=2.1
    kernel=GaussianKernel(feats_train, feats_train, width)

    C=1
    epsilon=1e-5
    labels=Labels(label_train_multiclass)

    svm=LaRank(C, kernel, labels)
    #svm.set_tau(1e-3)
    #svm.set_batch_mode(False)
    #svm.io.enable_progress()
    svm.set_epsilon(epsilon)
    svm.train()
    out=svm.classify(feats_test).get_labels()

if __name__=='__main__':
    from tools.load import LoadMatrix
    lm=LoadMatrix()
    fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
    label_train_multiclass=lm.load_labels('../data/label_train_multiclass.dat')
    larank()
# In this example a two-class linear classifier based on Linear Discriminant
# Analysis (LDA) is trained on a toy data set and the trained classifier is
# then used to predict test examples. The regularization parameter, which
# corresponds to the weight of an identity matrix added to the covariance
# matrix, is set to gamma=3.
#
# For more details on LDA see e.g.
#  http://en.wikipedia.org/wiki/Linear_discriminant_analysis

def lda ():
    print 'LDA'
    from shogun.Features import RealFeatures, Labels
    from shogun.Classifier import LDA

    feats_train=RealFeatures(fm_train_real)
    feats_test=RealFeatures(fm_test_real)

    gamma=3
    labels=Labels(label_train_twoclass)

    lda=LDA(gamma, feats_train, labels)
    lda.train()

    lda.get_bias()
    lda.get_w()
    lda.set_features(feats_test)
    lda.classify().get_labels()

if __name__=='__main__':
    from tools.load import LoadMatrix
    lm=LoadMatrix()
    fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
    label_train_twoclass=lm.load_labels('../data/label_train_twoclass.dat')
    lda()
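For intuition, the two-class LDA weight vector with this regularizer can be written as w = (C + gamma*I)^(-1) (mu_+ - mu_-). The following standalone NumPy sketch illustrates that textbook formula under the stated assumptions; it is not necessarily identical to Shogun's internal LDA implementation, and the function name is hypothetical:

import numpy

def lda_weights_sketch(X, y, gamma=3.0):
    # X: one column per example; y: labels in {-1,+1}
    mp = X[:, y>0].mean(axis=1)             # positive-class mean
    mn = X[:, y<0].mean(axis=1)             # negative-class mean
    C = numpy.cov(X)                        # covariance estimate (simple stand-in for the pooled class covariance)
    C += gamma*numpy.eye(X.shape[0])        # gamma times the identity as regularizer
    return numpy.linalg.solve(C, mp - mn)   # w = (C + gamma*I)^-1 (mu_+ - mu_-)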
# In this example a two-class linear support vector machine classifier is
# trained on a toy data set and the trained classifier is then used to predict
# labels of test examples. As training algorithm the LIBLINEAR solver is used
# with the SVM regularization parameter C=0.9, the bias in the classification
# rule switched on, and the precision parameter epsilon=1e-5.
#
# For more details on LIBLINEAR see
#  http://www.csie.ntu.edu.tw/~cjlin/liblinear/

def liblinear ():
    print 'LibLinear'
    from shogun.Features import RealFeatures, SparseRealFeatures, Labels
    from shogun.Classifier import LibLinear

    realfeat=RealFeatures(fm_train_real)
    feats_train=SparseRealFeatures()
    feats_train.obtain_from_simple(realfeat)
    realfeat=RealFeatures(fm_test_real)
    feats_test=SparseRealFeatures()
    feats_test.obtain_from_simple(realfeat)

    C=0.9
    epsilon=1e-5
    num_threads=1
    labels=Labels(label_train_twoclass)

    svm=LibLinear(C, feats_train, labels)
    svm.set_epsilon(epsilon)
    svm.parallel.set_num_threads(num_threads)
    svm.set_bias_enabled(True)
    svm.train()

    svm.set_features(feats_test)
    svm.classify().get_labels()

if __name__=='__main__':
    from tools.load import LoadMatrix
    lm=LoadMatrix()
    fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
    label_train_twoclass=lm.load_labels('../data/label_train_twoclass.dat')
    liblinear()
# In this example a two-class support vector machine classifier is trained on
# a 2-dimensional randomly generated data set and the trained classifier is
# used to predict labels of test examples. As training algorithm the LIBSVM
# solver is used with SVM regularization parameter C=1 and a Gaussian kernel
# of width 2.1.
#
# For more details on the LIBSVM solver see http://www.csie.ntu.edu.tw/~cjlin/libsvm/

from numpy import *
from numpy.random import randn
from shogun.Features import *
from shogun.Classifier import *
from shogun.Kernel import *

num=1000
dist=1
width=2.1
C=1

traindata_real=concatenate((randn(2,num)-dist, randn(2,num)+dist), axis=1)
testdata_real=concatenate((randn(2,num)-dist, randn(2,num)+dist), axis=1)

trainlab=concatenate((-ones(num), ones(num)))
testlab=concatenate((-ones(num), ones(num)))

feats_train=RealFeatures(traindata_real)
feats_test=RealFeatures(testdata_real)
kernel=GaussianKernel(feats_train, feats_train, width)
labels=Labels(trainlab)

svm=LibSVM(C, kernel, labels)
svm.train()

kernel.init(feats_train, feats_test)
out=svm.classify().get_labels()
testerr=mean(sign(out)!=testlab)
print testerr
# In this example a two-class support vector machine classifier is trained on
# a toy data set and the trained classifier is used to predict labels of test
# examples. As training algorithm the LIBSVM solver is used with SVM
# regularization parameter C=1, a Gaussian kernel of width 2.1, and the
# precision parameter epsilon=1e-5. The example also shows how to retrieve the
# support vectors from the trained SVM model.
#
# For more details on the LIBSVM solver see http://www.csie.ntu.edu.tw/~cjlin/libsvm/

def libsvm ():
    print 'LibSVM'
    from shogun.Features import RealFeatures, Labels
    from shogun.Kernel import GaussianKernel
    from shogun.Classifier import LibSVM

    feats_train=RealFeatures(fm_train_real)
    feats_test=RealFeatures(fm_test_real)
    width=2.1
    kernel=GaussianKernel(feats_train, feats_train, width)

    C=1
    epsilon=1e-5
    labels=Labels(label_train_twoclass)

    svm=LibSVM(C, kernel, labels)
    svm.set_epsilon(epsilon)
    svm.train()

    kernel.init(feats_train, feats_test)
    svm.classify().get_labels()
    sv_idx=svm.get_support_vectors()
    alphas=svm.get_alphas()

if __name__=='__main__':
    from tools.load import LoadMatrix
    lm=LoadMatrix()
    fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
    label_train_twoclass=lm.load_labels('../data/label_train_twoclass.dat')
    libsvm()
# In this example a multi-class support vector machine classifier is trained
# on a toy data set and the trained classifier is used to predict labels of
# test examples. As training algorithm the LIBSVM solver is used with SVM
# regularization parameter C=1, a Gaussian kernel of width 2.1, and the
# precision parameter epsilon=1e-5.
#
# For more details on the LIBSVM solver see http://www.csie.ntu.edu.tw/~cjlin/libsvm/

def libsvm_multiclass ():
    print 'LibSVMMultiClass'
    from shogun.Features import RealFeatures, Labels
    from shogun.Kernel import GaussianKernel
    from shogun.Classifier import LibSVMMultiClass

    feats_train=RealFeatures(fm_train_real)
    feats_test=RealFeatures(fm_test_real)
    width=2.1
    kernel=GaussianKernel(feats_train, feats_train, width)

    C=1
    epsilon=1e-5
    labels=Labels(label_train_multiclass)

    svm=LibSVMMultiClass(C, kernel, labels)
    svm.set_epsilon(epsilon)
    svm.train()

    kernel.init(feats_train, feats_test)
    svm.classify().get_labels()

if __name__=='__main__':
    from tools.load import LoadMatrix
    lm=LoadMatrix()
    fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
    label_train_multiclass=lm.load_labels('../data/label_train_multiclass.dat')
    libsvm_multiclass()
# In this example a one-class support vector machine classifier is trained on
# a toy data set. The training algorithm finds a hyperplane in the RKHS which
# separates the training data from the origin. The one-class classifier is
# typically used to estimate the support of a high-dimensional distribution.
# For more details see e.g.
#  B. Schoelkopf et al. Estimating the support of a high-dimensional
#  distribution. Neural Computation, 13, 2001, 1443-1471.
#
# In the example, the one-class SVM is trained by the LIBSVM solver with
# regularization parameter C=1, a Gaussian kernel of width 2.1, and the
# precision parameter epsilon=1e-5.
#
# For more details on the LIBSVM solver see http://www.csie.ntu.edu.tw/~cjlin/libsvm/

def libsvm_oneclass ():
    print 'LibSVMOneClass'
    from shogun.Features import RealFeatures, Labels
    from shogun.Kernel import GaussianKernel
    from shogun.Classifier import LibSVMOneClass

    feats_train=RealFeatures(fm_train_real)
    feats_test=RealFeatures(fm_test_real)
    width=2.1
    kernel=GaussianKernel(feats_train, feats_train, width)

    C=1
    epsilon=1e-5

    svm=LibSVMOneClass(C, kernel)
    svm.set_epsilon(epsilon)
    svm.train()

    kernel.init(feats_train, feats_test)
    svm.classify().get_labels()

if __name__=='__main__':
    from tools.load import LoadMatrix
    lm=LoadMatrix()
    fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
    libsvm_oneclass()
# In this example a two-class support vector machine classifier is trained on
# a toy data set and the trained classifier is used to predict labels of test
# examples. As training algorithm the Minimal Primal Dual SVM (MPD) is used
# with SVM regularization parameter C=1, a Gaussian kernel of width 2.1, and
# the precision parameter epsilon=1e-5.
#
# For more details on the MPD solver see
#  Kienzle, W. and B. Schölkopf: Training Support Vector Machines with
#  Multiple Equality Constraints. Machine Learning: ECML 2005, 182-193.
#  (Eds.) Carbonell, J. G., J. Siekmann, Springer, Berlin, Germany (11 2005)

def mpdsvm ():
    print 'MPDSVM'
    from shogun.Features import RealFeatures, Labels
    from shogun.Kernel import GaussianKernel
    from shogun.Classifier import MPDSVM

    feats_train=RealFeatures(fm_train_real)
    feats_test=RealFeatures(fm_test_real)
    width=2.1
    kernel=GaussianKernel(feats_train, feats_train, width)

    C=1
    epsilon=1e-5
    labels=Labels(label_train_twoclass)

    svm=MPDSVM(C, kernel, labels)
    svm.set_epsilon(epsilon)
    svm.train()

    kernel.init(feats_train, feats_test)
    svm.classify().get_labels()

if __name__=='__main__':
    from tools.load import LoadMatrix
    lm=LoadMatrix()
    fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
    label_train_twoclass=lm.load_labels('../data/label_train_twoclass.dat')
    mpdsvm()
# This example shows usage of the Perceptron algorithm for training a
# two-class linear classifier, i.e. y = sign(<x,w>+b). The Perceptron
# algorithm works by iteratively passing through the training examples and
# applying the update rule to those examples which are misclassified by the
# current classifier. The Perceptron update rule reads
#
#   w(t+1) = w(t) + alpha * y_t * x_t
#   b(t+1) = b(t) + alpha * y_t
#
# where (x_t,y_t) is the feature vector and label (must be +1/-1) of the
# misclassified example, (w(t),b(t)) are the current parameters of the linear
# classifier, (w(t+1),b(t+1)) are the new parameters, and alpha is the
# learning rate (in this example alpha=1).
#
# The Perceptron algorithm iterates until all training examples are correctly
# classified or the prescribed maximal number of iterations (in this example
# max_iter=1000) is reached.

def perceptron ():
    print 'Perceptron'
    from shogun.Features import RealFeatures, Labels
    from shogun.Classifier import Perceptron

    feats_train=RealFeatures(fm_train_real)
    feats_test=RealFeatures(fm_test_real)

    learn_rate=1.
    max_iter=1000
    labels=Labels(label_train_twoclass)

    perceptron=Perceptron(feats_train, labels)
    perceptron.set_learn_rate(learn_rate)
    perceptron.set_max_iter(max_iter)
    # only guaranteed to converge for separable data
    perceptron.train()

    perceptron.set_features(feats_test)
    perceptron.classify().get_labels()

if __name__=='__main__':
    from tools.load import LoadMatrix
    lm=LoadMatrix()
    fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
    label_train_twoclass=lm.load_labels('../data/label_train_twoclass.dat')
    perceptron()
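To make the update rule concrete, here is a minimal standalone NumPy sketch of the same algorithm (an illustration independent of the Shogun Perceptron class; the function name is hypothetical):

import numpy

def perceptron_sketch(X, y, alpha=1.0, max_iter=1000):
    # X: one column per example; y: labels in {-1,+1}
    w = numpy.zeros(X.shape[0])
    b = 0.0
    for it in xrange(max_iter):
        mistakes = 0
        for t in xrange(X.shape[1]):
            if y[t]*(numpy.dot(w, X[:,t]) + b) <= 0:  # misclassified example
                w += alpha*y[t]*X[:,t]                # w(t+1) = w(t) + alpha*y_t*x_t
                b += alpha*y[t]                       # b(t+1) = b(t) + alpha*y_t
                mistakes += 1
        if mistakes == 0:                             # all examples correctly classified
            break
    return w, b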
# In this example a two-class linear support vector machine classifier is
# trained on a toy data set and the trained classifier is used to predict
# labels of test examples. As training algorithm the steepest descent
# subgradient algorithm is used. The SVM regularization parameter is set to
# C=0.9 and the bias in the classification rule is switched off, i.e. an
# unbiased linear rule is trained. The solver iterates until it finds an
# epsilon-precise solution (epsilon=1e-3) or the maximal training time
# max_train_time=1 (seconds) is exceeded.
#
# Note that this solver often does not converge because the steepest descent
# subgradient algorithm is oversensitive to rounding errors. Note also that
# this is unpublished work which was a predecessor of the OCAS solver (see
# classifier_svmocas).

def subgradient_svm ():
    print 'SubGradientSVM'
    from shogun.Features import RealFeatures, SparseRealFeatures, Labels
    from shogun.Classifier import SubGradientSVM

    realfeat=RealFeatures(fm_train_real)
    feats_train=SparseRealFeatures()
    feats_train.obtain_from_simple(realfeat)
    realfeat=RealFeatures(fm_test_real)
    feats_test=SparseRealFeatures()
    feats_test.obtain_from_simple(realfeat)

    C=0.9
    epsilon=1e-3
    num_threads=1
    max_train_time=1.
    labels=Labels(label_train_twoclass)

    svm=SubGradientSVM(C, feats_train, labels)
    svm.set_epsilon(epsilon)
    svm.parallel.set_num_threads(num_threads)
    svm.set_bias_enabled(False)
    svm.set_max_train_time(max_train_time)
    svm.train()

    svm.set_features(feats_test)
    svm.classify().get_labels()

if __name__=='__main__':
    from tools.load import LoadMatrix
    lm=LoadMatrix()
    fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
    label_train_twoclass=lm.load_labels('../data/label_train_twoclass.dat')
    subgradient_svm()
# In this example a two-class support vector machine classifier is trained on
# a DNA splice-site detection data set and the trained classifier is used to
# predict labels on the test set. As training algorithm SVM^light is used with
# SVM regularization parameter C=1, the Weighted Degree kernel of degree 20,
# and the precision parameter epsilon=1e-5. The LINADD trick is used to speed
# up training.
#
# For more details on SVM^light see
#  T. Joachims. Making large-scale SVM learning practical. In Advances in
#  Kernel Methods -- Support Vector Learning, pages 169-184. MIT Press,
#  Cambridge, MA USA, 1999.
#
# For more details on the Weighted Degree kernel and the LINADD trick see
#  Sonnenburg, S. and Rätsch, G. and Rieck, K. Large Scale Learning with
#  String Kernels. In Bottou, Leon and Chapelle, Olivier and DeCoste, Dennis
#  and Weston, Jason, editor, In Large Scale Kernel Machines, pages 73-103,
#  MIT Press, Cambridge, MA. 2007.

def do_batch_linadd ():
    print 'SVMlight batch'
    from shogun.Features import StringCharFeatures, Labels, DNA
    from shogun.Kernel import WeightedDegreeStringKernel
    try:
        from shogun.Classifier import SVMLight
    except ImportError:
        print 'No support for SVMLight available.'
        return

    feats_train=StringCharFeatures(DNA)
    feats_train.set_features(fm_train_dna)
    feats_test=StringCharFeatures(DNA)
    feats_test.set_features(fm_test_dna)
    degree=20

    kernel=WeightedDegreeStringKernel(feats_train, feats_train, degree)

    C=1
    epsilon=1e-5
    num_threads=2
    labels=Labels(label_train_dna)

    svm=SVMLight(C, kernel, labels)
    svm.set_epsilon(epsilon)
    svm.parallel.set_num_threads(num_threads)
    svm.train()

    kernel.init(feats_train, feats_test)

    #print 'SVMLight Objective: %f num_sv: %d' % \
    #    (svm.get_objective(), svm.get_num_support_vectors())
    svm.set_batch_computation_enabled(False)
    svm.set_linadd_enabled(False)
    svm.classify().get_labels()

    svm.set_batch_computation_enabled(True)
    svm.classify().get_labels()

if __name__=='__main__':
    from tools.load import LoadMatrix
    lm=LoadMatrix()
    fm_train_dna=lm.load_dna('../data/fm_train_dna.dat')
    fm_test_dna=lm.load_dna('../data/fm_test_dna.dat')
    label_train_dna=lm.load_labels('../data/label_train_dna.dat')
    do_batch_linadd()
# This example demonstrates how to train an SVMLight classifier using a custom
# linear term. This is used in the DASVM class, which pre-computes the linear
# term from a previously trained SVM.

import numpy
from shogun.Features import StringCharFeatures, Labels, DNA
from shogun.Kernel import WeightedDegreeStringKernel
from shogun.Classifier import SVMLight

degree=3

fm_train_dna=['CGCACGTACGTAGCTCGAT', 'CGACGTAGTCGTAGTCGTA',
    'CGACGGGGGGGGGGTCGTA', 'CGACCTAGTCGTAGTCGTA', 'CGACCACAGTTATATAGTA',
    'CGACGTAGTCGTAGTCGTA', 'CGACGTAGTTTTTTTCGTA', 'CGACGTAGTCGTAGCCCCA',
    'CAAAAAAAAAAAAAAAATA', 'CGACGGGGGGGGGGGCGTA']
label_train_dna=numpy.array(5*[-1.0] + 5*[1.0])

fm_test_dna=['AGCACGTACGTAGCTCGAT', 'AGACGTAGTCGTAGTCGTA',
    'CAACGGGGGGGGGGTCGTA', 'CGACCTAGTCGTAGTCGTA', 'CGAACACAGTTATATAGTA',
    'CGACCTAGTCGTAGTCGTA', 'CGACGTGGGGTTTTTCGTA', 'CGACGTAGTCCCAGCCCCA',
    'CAAAAAAAAAAAACCAATA', 'CGACGGCCGGGGGGGCGTA']
label_test_dna=numpy.array(5*[-1.0] + 5*[1.0])

print 'SVMLight'

feats_train=StringCharFeatures(DNA)
feats_train.set_features(fm_train_dna)
feats_test=StringCharFeatures(DNA)
feats_test.set_features(fm_test_dna)

kernel=WeightedDegreeStringKernel(feats_train, feats_train, degree)

C=10
epsilon=1e-5
num_threads=1
labels=Labels(label_train_dna)

svm=SVMLight(C, kernel, labels)
svm.set_qpsize(3)
# custom linear term of the quadratic program, one entry per training example
svm.set_linear_term(-numpy.array([1,2,3,4,5,6,7,8,7,6], dtype=numpy.double))
svm.set_epsilon(epsilon)
svm.parallel.set_num_threads(num_threads)
svm.train()

kernel.init(feats_train, feats_test)
out = svm.classify().get_labels()
# In this example a two-class support vector machine classifier is trained on
# a DNA splice-site detection data set and the trained classifier is used to
# predict labels on the test set. As training algorithm SVM^light is used with
# SVM regularization parameter C=1.2, the Weighted Degree kernel of degree 20,
# and the precision parameter epsilon=1e-5.
#
# For more details on SVM^light see
#  T. Joachims. Making large-scale SVM learning practical. In Advances in
#  Kernel Methods -- Support Vector Learning, pages 169-184. MIT Press,
#  Cambridge, MA USA, 1999.
#
# For more details on the Weighted Degree kernel see
#  G. Raetsch, S. Sonnenburg, and B. Schoelkopf. RASE: recognition of
#  alternatively spliced exons in C. elegans. Bioinformatics, 21:369-377,
#  June 2005.

def svm_light ():
    print 'SVMLight'
    from shogun.Features import StringCharFeatures, Labels, DNA
    from shogun.Kernel import WeightedDegreeStringKernel
    try:
        from shogun.Classifier import SVMLight
    except ImportError:
        print 'No support for SVMLight available.'
        return

    feats_train=StringCharFeatures(DNA)
    feats_train.set_features(fm_train_dna)
    feats_test=StringCharFeatures(DNA)
    feats_test.set_features(fm_test_dna)
    degree=20

    kernel=WeightedDegreeStringKernel(feats_train, feats_train, degree)

    C=1.2
    epsilon=1e-5
    num_threads=1
    labels=Labels(label_train_dna)

    svm=SVMLight(C, kernel, labels)
    svm.set_epsilon(epsilon)
    svm.parallel.set_num_threads(num_threads)
    svm.train()

    kernel.init(feats_train, feats_test)
    svm.classify().get_labels()

if __name__=='__main__':
    from tools.load import LoadMatrix
    lm=LoadMatrix()
    fm_train_dna=lm.load_dna('../data/fm_train_dna.dat')
    fm_test_dna=lm.load_dna('../data/fm_test_dna.dat')
    label_train_dna=lm.load_labels('../data/label_train_dna.dat')
    svm_light()
# In this example a two-class linear support vector machine classifier (SVM)
# is trained on a toy data set and the trained classifier is used to predict
# labels of test examples. As training algorithm the SVMLIN solver is used
# with the SVM regularization parameter C=0.9, the bias in the classification
# rule switched on, and the precision parameter epsilon=1e-5. The example also
# shows how to retrieve the parameters (vector w and bias b) of the trained
# linear classifier.
#
# For more details on the SVMLIN solver see
#  V. Sindhwani, S.S. Keerthi. Newton Methods for Fast Solution of
#  Semi-supervised Linear SVMs. Large Scale Kernel Machines, MIT Press
#  (Book Chapter), 2007

def svmlin ():
    print 'SVMLin'
    from shogun.Features import RealFeatures, SparseRealFeatures, Labels
    from shogun.Classifier import SVMLin

    realfeat=RealFeatures(fm_train_real)
    feats_train=SparseRealFeatures()
    feats_train.obtain_from_simple(realfeat)
    realfeat=RealFeatures(fm_test_real)
    feats_test=SparseRealFeatures()
    feats_test.obtain_from_simple(realfeat)

    C=0.9
    epsilon=1e-5
    num_threads=1
    labels=Labels(label_train_twoclass)

    svm=SVMLin(C, feats_train, labels)
    svm.set_epsilon(epsilon)
    svm.parallel.set_num_threads(num_threads)
    svm.set_bias_enabled(True)
    svm.train()

    svm.set_features(feats_test)
    svm.get_bias()
    svm.get_w()
    svm.classify().get_labels()

if __name__=='__main__':
    from tools.load import LoadMatrix
    lm=LoadMatrix()
    fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
    label_train_twoclass=lm.load_labels('../data/label_train_twoclass.dat')
    svmlin()
# In this example a two-class linear support vector machine classifier is
# trained on a toy data set and the trained classifier is used to predict
# labels of test examples. As training algorithm the OCAS solver is used with
# the SVM regularization parameter C=0.9, the bias term in the classification
# rule switched off, and the precision parameter epsilon=1e-5 (duality gap).
#
# For more details on the OCAS solver see
#  V. Franc, S. Sonnenburg. Optimized Cutting Plane Algorithm for Large-Scale
#  Risk Minimization. The Journal of Machine Learning Research, vol. 10,
#  pp. 2157--2192. October 2009.

def svmocas ():
    print 'SVMOcas'
    from shogun.Features import RealFeatures, SparseRealFeatures, Labels
    from shogun.Classifier import SVMOcas

    realfeat=RealFeatures(fm_train_real)
    feats_train=SparseRealFeatures()
    feats_train.obtain_from_simple(realfeat)
    realfeat=RealFeatures(fm_test_real)
    feats_test=SparseRealFeatures()
    feats_test.obtain_from_simple(realfeat)

    C=0.9
    epsilon=1e-5
    num_threads=1
    labels=Labels(label_train_twoclass)

    svm=SVMOcas(C, feats_train, labels)
    svm.set_epsilon(epsilon)
    svm.parallel.set_num_threads(num_threads)
    svm.set_bias_enabled(False)
    svm.train()

    svm.set_features(feats_test)
    svm.classify().get_labels()

if __name__=='__main__':
    from tools.load import LoadMatrix
    lm=LoadMatrix()
    fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
    label_train_twoclass=lm.load_labels('../data/label_train_twoclass.dat')
    svmocas()
# In this example a two-class linear support vector machine classifier is
# trained on a toy data set and the trained classifier is used to predict
# labels of test examples. As training algorithm the Stochastic Gradient
# Descent (SGD) solver is used with the SVM regularization parameter C=0.9.
# The number of iterations, i.e. passes through all training examples, is set
# to num_iter=5.
#
# For more details on the SGD solver see
#  L. Bottou, O. Bousquet. The tradeoff of large scale learning. In NIPS 20.
#  MIT Press. 2008.

def svmsgd ():
    print 'SVMSGD'
    from shogun.Features import RealFeatures, SparseRealFeatures, Labels
    from shogun.Classifier import SVMSGD

    realfeat=RealFeatures(fm_train_real)
    feats_train=SparseRealFeatures()
    feats_train.obtain_from_simple(realfeat)
    realfeat=RealFeatures(fm_test_real)
    feats_test=SparseRealFeatures()
    feats_test.obtain_from_simple(realfeat)

    C=0.9
    num_iter=5
    labels=Labels(label_train_twoclass)

    svm=SVMSGD(C, feats_train, labels)
    svm.set_epochs(num_iter)
    #svm.io.set_loglevel(0)
    svm.train()

    svm.set_features(feats_test)
    svm.classify().get_labels()

if __name__=='__main__':
    from tools.load import LoadMatrix
    lm=LoadMatrix()
    fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
    label_train_twoclass=lm.load_labels('../data/label_train_twoclass.dat')
    svmsgd()
# In this example an agglomerative hierarchical single-linkage clustering
# method is used to cluster a given toy data set. Starting with each object
# in its own cluster, clusters are iteratively merged; in each step the two
# clusters whose closest elements have the minimum distance (here measured
# via the Euclidean distance object) are merged.

def hierarchical ():
    print 'Hierarchical'
    from shogun.Distance import EuclidianDistance
    from shogun.Features import RealFeatures
    from shogun.Clustering import Hierarchical

    merges=3
    feats_train=RealFeatures(fm_train)
    distance=EuclidianDistance(feats_train, feats_train)

    hierarchical=Hierarchical(merges, distance)
    hierarchical.train()

    hierarchical.get_merge_distances()
    hierarchical.get_cluster_pairs()

if __name__=='__main__':
    from tools.load import LoadMatrix
    lm=LoadMatrix()
    fm_train=lm.load_numbers('../data/fm_train_real.dat')
    hierarchical()
# In this example the k-means clustering method is used to cluster a given toy
# data set. In k-means clustering one tries to partition n observations into k
# clusters such that each observation belongs to the cluster with the nearest
# mean. The class constructor takes the number of clusters and a distance
# object as input; the distance used in this example is the Euclidean
# distance. After training, the result of the clustering can be fetched by
# obtaining the cluster centers and their radii.

def kmeans ():
    print 'KMeans'
    from shogun.Distance import EuclidianDistance
    from shogun.Features import RealFeatures
    from shogun.Clustering import KMeans

    k=3
    feats_train=RealFeatures(fm_train)
    distance=EuclidianDistance(feats_train, feats_train)

    kmeans=KMeans(k, distance)
    kmeans.train()

    kmeans.get_cluster_centers()
    kmeans.get_radiuses()

if __name__=='__main__':
    from tools.load import LoadMatrix
    lm=LoadMatrix()
    fm_train=lm.load_numbers('../data/fm_train_real.dat')
    kmeans()
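For intuition, one iteration of the underlying Lloyd algorithm (assignment step plus mean update) can be sketched in plain NumPy. This is a standalone illustration with a hypothetical function name, not how the Shogun KMeans class is implemented:

import numpy

def lloyd_step(X, centers):
    # X: one column per point; centers: one column per cluster mean
    # assignment step: index of the nearest center for every point
    d = ((X[:,:,None]-centers[:,None,:])**2).sum(axis=0)
    assign = d.argmin(axis=1)
    # update step: each center becomes the mean of its assigned points
    for j in xrange(centers.shape[1]):
        pts = X[:, assign==j]
        if pts.shape[1] > 0:
            centers[:,j] = pts.mean(axis=1)
    return centers, assign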
# This example shows how to compute the Bray-Curtis distance. The type of your
# data determines which distance functions you can use.
#
# Two matrices of real values are loaded from files and wrapped into
# 'RealFeatures'; each column corresponds to one data point. A distance object
# initialized with two data sets computes the pairwise distance matrix, which
# is obtained via 'get_distance_matrix'. Calling 'init' binds the distance
# object to new data sets, after which 'get_distance_matrix' returns the
# pairwise distances between those sets; the previously computed matrix is no
# longer accessible.
#
# For more details see doc/classshogun_1_1CBrayCurtisDistance.html.
#
# Using the Bray-Curtis distance is of course not limited to this showcase example.

def bray_curtis_distance ():
    print 'BrayCurtisDistance'
    from shogun.Features import RealFeatures
    from shogun.Distance import BrayCurtisDistance

    feats_train=RealFeatures(fm_train_real)
    feats_test=RealFeatures(fm_test_real)

    distance=BrayCurtisDistance(feats_train, feats_train)
    dm_train=distance.get_distance_matrix()

    distance.init(feats_train, feats_test)
    dm_test=distance.get_distance_matrix()

if __name__=='__main__':
    from tools.load import LoadMatrix
    lm=LoadMatrix()
    fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
    bray_curtis_distance()
# This example shows how to compute the Canberra distance, a dissimilarity
# ratio. The type of your data determines which distance functions you can use.
#
# Two matrices of real values are loaded from files and wrapped into
# 'RealFeatures'; each column corresponds to one data point. A distance object
# initialized with two data sets computes the pairwise distance matrix, which
# is obtained via 'get_distance_matrix'. Calling 'init' binds the distance
# object to new data sets, after which 'get_distance_matrix' returns the
# pairwise distances between those sets; the previously computed matrix is no
# longer accessible.
#
# For more details see doc/classshogun_1_1CCanberraMetric.html.
#
# Using the Canberra distance is of course not limited to this showcase example.

def canberra_metric ():
    print 'CanberraMetric'
    from shogun.Features import RealFeatures
    from shogun.Distance import CanberraMetric

    feats_train=RealFeatures(fm_train_real)
    feats_test=RealFeatures(fm_test_real)

    distance=CanberraMetric(feats_train, feats_train)
    dm_train=distance.get_distance_matrix()

    distance.init(feats_train, feats_test)
    dm_test=distance.get_distance_matrix()

if __name__=='__main__':
    from tools.load import LoadMatrix
    lm=LoadMatrix()
    fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
    canberra_metric()
# This example shows how to compute the Canberra word distance. The DNA
# strings are first converted into strings of 3-mers (order=3) over a word
# alphabet and sorted, so that the distance can be computed efficiently.

def canberra_word_distance ():
    print 'CanberraWordDistance'
    from shogun.Features import StringCharFeatures, StringWordFeatures, DNA
    from shogun.PreProc import SortWordString
    from shogun.Distance import CanberraWordDistance

    order=3
    gap=0
    reverse=False

    charfeat=StringCharFeatures(DNA)
    charfeat.set_features(fm_train_dna)
    feats_train=StringWordFeatures(charfeat.get_alphabet())
    feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
    preproc=SortWordString()
    preproc.init(feats_train)
    feats_train.add_preproc(preproc)
    feats_train.apply_preproc()

    charfeat=StringCharFeatures(DNA)
    charfeat.set_features(fm_test_dna)
    feats_test=StringWordFeatures(charfeat.get_alphabet())
    feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
    feats_test.add_preproc(preproc)
    feats_test.apply_preproc()

    distance=CanberraWordDistance(feats_train, feats_train)
    dm_train=distance.get_distance_matrix()

    distance.init(feats_train, feats_test)
    dm_test=distance.get_distance_matrix()

if __name__=='__main__':
    from tools.load import LoadMatrix
    lm=LoadMatrix()
    fm_train_dna=lm.load_dna('../data/fm_train_dna.dat')
    fm_test_dna=lm.load_dna('../data/fm_test_dna.dat')
    canberra_word_distance()
# This example shows how to compute the Chebyshev distance (the maximum of the
# absolute differences over the feature dimensions); the corresponding Shogun
# class is named ChebyshewMetric. The type of your data determines which
# distance functions you can use.
#
# Two matrices of real values are loaded from files and wrapped into
# 'RealFeatures'; each column corresponds to one data point. A distance object
# initialized with two data sets computes the pairwise distance matrix, which
# is obtained via 'get_distance_matrix'. Calling 'init' binds the distance
# object to new data sets, after which 'get_distance_matrix' returns the
# pairwise distances between those sets; the previously computed matrix is no
# longer accessible.
#
# For more details see doc/classshogun_1_1CChebyshewMetric.html.
#
# Using the Chebyshev distance is of course not limited to this showcase example.

def chebyshew_metric ():
    print 'ChebyshewMetric'
    from shogun.Features import RealFeatures
    from shogun.Distance import ChebyshewMetric

    feats_train=RealFeatures(fm_train_real)
    feats_test=RealFeatures(fm_test_real)

    distance=ChebyshewMetric(feats_train, feats_train)
    dm_train=distance.get_distance_matrix()

    distance.init(feats_train, feats_test)
    dm_test=distance.get_distance_matrix()

if __name__=='__main__':
    from tools.load import LoadMatrix
    lm=LoadMatrix()
    fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
    chebyshew_metric()
# This example shows how to compute the chi-square distance. The type of your
# data determines which distance functions you can use.
#
# Two matrices of real values are loaded from files and wrapped into
# 'RealFeatures'; each column corresponds to one data point. A distance object
# initialized with two data sets computes the pairwise distance matrix, which
# is obtained via 'get_distance_matrix'. Calling 'init' binds the distance
# object to new data sets, after which 'get_distance_matrix' returns the
# pairwise distances between those sets; the previously computed matrix is no
# longer accessible.
#
# For more details see doc/classshogun_1_1CChiSquareDistance.html.
#
# Using the chi-square distance is of course not limited to this showcase example.

def chi_square_distance ():
    print 'ChiSquareDistance'
    from shogun.Features import RealFeatures
    from shogun.Distance import ChiSquareDistance

    feats_train=RealFeatures(fm_train_real)
    feats_test=RealFeatures(fm_test_real)

    distance=ChiSquareDistance(feats_train, feats_train)
    dm_train=distance.get_distance_matrix()

    distance.init(feats_train, feats_test)
    dm_test=distance.get_distance_matrix()

if __name__=='__main__':
    from tools.load import LoadMatrix
    lm=LoadMatrix()
    fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
    chi_square_distance()
# This example shows how to compute the cosine distance. The type of your data
# determines which distance functions you can use.
#
# Two matrices of real values are loaded from files and wrapped into
# 'RealFeatures'; each column corresponds to one data point. A distance object
# initialized with two data sets computes the pairwise distance matrix, which
# is obtained via 'get_distance_matrix'. Calling 'init' binds the distance
# object to new data sets, after which 'get_distance_matrix' returns the
# pairwise distances between those sets; the previously computed matrix is no
# longer accessible.
#
# For more details see doc/classshogun_1_1CCosineDistance.html.
#
# Using the cosine distance is of course not limited to this showcase example.

def cosine_distance ():
    print 'CosineDistance'
    from shogun.Features import RealFeatures
    from shogun.Distance import CosineDistance

    feats_train=RealFeatures(fm_train_real)
    feats_test=RealFeatures(fm_test_real)

    distance=CosineDistance(feats_train, feats_train)
    dm_train=distance.get_distance_matrix()

    distance.init(feats_train, feats_test)
    dm_test=distance.get_distance_matrix()

if __name__=='__main__':
    from tools.load import LoadMatrix
    lm=LoadMatrix()
    fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
    cosine_distance()
# This example shows how to compute the Euclidean distance (the corresponding
# Shogun class is named EuclidianDistance). The type of your data determines
# which distance functions you can use.
#
# Two matrices of real values are loaded from files and wrapped into
# 'RealFeatures'; each column corresponds to one data point. A distance object
# initialized with two data sets computes the pairwise distance matrix, which
# is obtained via 'get_distance_matrix'. Calling 'init' binds the distance
# object to new data sets, after which 'get_distance_matrix' returns the
# pairwise distances between those sets; the previously computed matrix is no
# longer accessible.
#
# For more details see doc/classshogun_1_1CEuclidianDistance.html.
#
# Using the Euclidean distance is of course not limited to this showcase example.

def euclidian_distance ():
    print 'EuclidianDistance'
    from shogun.Features import RealFeatures
    from shogun.Distance import EuclidianDistance

    feats_train=RealFeatures(fm_train_real)
    feats_test=RealFeatures(fm_test_real)

    distance=EuclidianDistance(feats_train, feats_train)
    dm_train=distance.get_distance_matrix()

    distance.init(feats_train, feats_test)
    dm_test=distance.get_distance_matrix()

if __name__=='__main__':
    from tools.load import LoadMatrix
    lm=LoadMatrix()
    fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
    euclidian_distance()
# This example shows how to compute the geodesic distance (the shortest path
# on a sphere). The type of your data determines which distance functions you
# can use.
#
# Two matrices of real values are loaded from files and wrapped into
# 'RealFeatures'; each column corresponds to one data point. A distance object
# initialized with two data sets computes the pairwise distance matrix, which
# is obtained via 'get_distance_matrix'. Calling 'init' binds the distance
# object to new data sets, after which 'get_distance_matrix' returns the
# pairwise distances between those sets; the previously computed matrix is no
# longer accessible.
#
# For more details see doc/classshogun_1_1CGeodesicMetric.html.
#
# Using the geodesic distance is of course not limited to this showcase example.

def geodesic_metric ():
    print 'GeodesicMetric'
    from shogun.Features import RealFeatures
    from shogun.Distance import GeodesicMetric

    feats_train=RealFeatures(fm_train_real)
    feats_test=RealFeatures(fm_test_real)

    distance=GeodesicMetric(feats_train, feats_train)
    dm_train=distance.get_distance_matrix()

    distance.init(feats_train, feats_test)
    dm_test=distance.get_distance_matrix()

if __name__=='__main__':
    from tools.load import LoadMatrix
    lm=LoadMatrix()
    fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
    geodesic_metric()
# This example shows how to compute the Hamming word distance for string
# features. The DNA strings are first converted into sorted strings of 3-mers
# (order=3) over a word alphabet.

def hamming_word_distance ():
    print 'HammingWordDistance'
    from shogun.Features import StringCharFeatures, StringWordFeatures, DNA
    from shogun.PreProc import SortWordString
    from shogun.Distance import HammingWordDistance

    order=3
    gap=0
    reverse=False

    charfeat=StringCharFeatures(DNA)
    charfeat.set_features(fm_train_dna)
    feats_train=StringWordFeatures(charfeat.get_alphabet())
    feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
    preproc=SortWordString()
    preproc.init(feats_train)
    feats_train.add_preproc(preproc)
    feats_train.apply_preproc()

    charfeat=StringCharFeatures(DNA)
    charfeat.set_features(fm_test_dna)
    feats_test=StringWordFeatures(charfeat.get_alphabet())
    feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
    feats_test.add_preproc(preproc)
    feats_test.apply_preproc()

    use_sign=False
    distance=HammingWordDistance(feats_train, feats_train, use_sign)
    dm_train=distance.get_distance_matrix()

    distance.init(feats_train, feats_test)
    dm_test=distance.get_distance_matrix()

if __name__=='__main__':
    from tools.load import LoadMatrix
    lm=LoadMatrix()
    fm_train_dna=lm.load_dna('../data/fm_train_dna.dat')
    fm_test_dna=lm.load_dna('../data/fm_test_dna.dat')
    hamming_word_distance()
# This example shows how to compute the Jensen-Shannon distance, a divergence
# measure based on the Kullback-Leibler divergence. The type of your data
# determines which distance functions you can use.
#
# Two matrices of real values are loaded from files and wrapped into
# 'RealFeatures'; each column corresponds to one data point. A distance object
# initialized with two data sets computes the pairwise distance matrix, which
# is obtained via 'get_distance_matrix'. Calling 'init' binds the distance
# object to new data sets, after which 'get_distance_matrix' returns the
# pairwise distances between those sets; the previously computed matrix is no
# longer accessible.
#
# For more details see doc/classshogun_1_1CJensenMetric.html.
#
# Using the Jensen-Shannon distance is of course not limited to this showcase example.

def jensen_metric ():
    print 'JensenMetric'
    from shogun.Features import RealFeatures
    from shogun.Distance import JensenMetric

    feats_train=RealFeatures(fm_train_real)
    feats_test=RealFeatures(fm_test_real)

    distance=JensenMetric(feats_train, feats_train)
    dm_train=distance.get_distance_matrix()

    distance.init(feats_train, feats_test)
    dm_test=distance.get_distance_matrix()

if __name__=='__main__':
    from tools.load import LoadMatrix
    lm=LoadMatrix()
    fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
    jensen_metric()
# This example shows how to compute the Manhattan distance.

def manhattan_metric ():
    print 'ManhattanMetric'
    from shogun.Features import RealFeatures
    from shogun.Distance import ManhattanMetric

    feats_train=RealFeatures(fm_train_real)
    feats_test=RealFeatures(fm_test_real)

    distance=ManhattanMetric(feats_train, feats_train)
    dm_train=distance.get_distance_matrix()

    distance.init(feats_train, feats_test)
    dm_test=distance.get_distance_matrix()

if __name__=='__main__':
    from tools.load import LoadMatrix
    lm=LoadMatrix()
    fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
    manhattan_metric()
# This example shows how to compute the Manhattan word distance for string
# features. The DNA strings are first converted into sorted strings of 3-mers
# (order=3) over a word alphabet.

def manhattan_word_distance ():
    print 'ManhattanWordDistance'
    from shogun.Features import StringCharFeatures, StringWordFeatures, DNA
    from shogun.PreProc import SortWordString
    from shogun.Distance import ManhattanWordDistance

    order=3
    gap=0
    reverse=False

    charfeat=StringCharFeatures(DNA)
    charfeat.set_features(fm_train_dna)
    feats_train=StringWordFeatures(charfeat.get_alphabet())
    feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
    preproc=SortWordString()
    preproc.init(feats_train)
    feats_train.add_preproc(preproc)
    feats_train.apply_preproc()

    charfeat=StringCharFeatures(DNA)
    charfeat.set_features(fm_test_dna)
    feats_test=StringWordFeatures(charfeat.get_alphabet())
    feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
    feats_test.add_preproc(preproc)
    feats_test.apply_preproc()

    distance=ManhattanWordDistance(feats_train, feats_train)
    dm_train=distance.get_distance_matrix()

    distance.init(feats_train, feats_test)
    dm_test=distance.get_distance_matrix()

if __name__=='__main__':
    from tools.load import LoadMatrix
    lm=LoadMatrix()
    fm_train_dna=lm.load_dna('../data/fm_train_dna.dat')
    fm_test_dna=lm.load_dna('../data/fm_test_dna.dat')
    manhattan_word_distance()
# This example shows how to compute the Minkowski metric of norm k (here k=3).
# The type of your data determines which distance functions you can use.
#
# Two matrices of real values are loaded from files and wrapped into
# 'RealFeatures'; each column corresponds to one data point. A distance object
# initialized with two data sets and the norm 'k' computes the pairwise
# distance matrix, which is obtained via 'get_distance_matrix'. Calling 'init'
# binds the distance object to new data sets, after which
# 'get_distance_matrix' returns the pairwise distances between those sets; the
# previously computed matrix is no longer accessible.
#
# For more details see doc/classshogun_1_1CMinkowskiMetric.html.
#
# Using the Minkowski metric is of course not limited to this showcase example.

def minkowski_metric ():
    print 'MinkowskiMetric'
    from shogun.Features import RealFeatures
    from shogun.Distance import MinkowskiMetric

    feats_train=RealFeatures(fm_train_real)
    feats_test=RealFeatures(fm_test_real)
    k=3

    distance=MinkowskiMetric(feats_train, feats_train, k)
    dm_train=distance.get_distance_matrix()

    distance.init(feats_train, feats_test)
    dm_test=distance.get_distance_matrix()

if __name__=='__main__':
    from tools.load import LoadMatrix
    lm=LoadMatrix()
    fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
    minkowski_metric()
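The Minkowski distance of order k is d(x,y) = (sum_i |x_i - y_i|^k)^(1/k). Assuming the variables from the example above are in scope, a single matrix entry can be cross-checked with plain NumPy (an illustrative sketch):

import numpy
x=fm_train_real[:,0]
y=fm_train_real[:,1]
# for k=3 this should match dm_train[0,1]
print (numpy.abs(x-y)**3).sum()**(1.0/3)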
# This example shows how to compute the squared Euclidean distance: the square
# root normally applied by the Euclidean distance is disabled via
# set_disable_sqrt(True).

def norm_squared_distance ():
    print 'EuclidianDistance - NormSquared'
    from shogun.Features import RealFeatures
    from shogun.Distance import EuclidianDistance

    feats_train=RealFeatures(fm_train_real)
    feats_test=RealFeatures(fm_test_real)

    distance=EuclidianDistance(feats_train, feats_train)
    distance.set_disable_sqrt(True)
    dm_train=distance.get_distance_matrix()

    distance.init(feats_train, feats_test)
    dm_test=distance.get_distance_matrix()

if __name__=='__main__':
    from tools.load import LoadMatrix
    lm=LoadMatrix()
    fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
    norm_squared_distance()
# This example shows how to compute the Euclidean distance on sparse features;
# the dense input matrices are first converted to 'SparseRealFeatures'.

def sparse_euclidian_distance ():
    print 'SparseEuclidianDistance'
    from shogun.Features import RealFeatures, SparseRealFeatures
    from shogun.Distance import SparseEuclidianDistance

    realfeat=RealFeatures(fm_train_real)
    feats_train=SparseRealFeatures()
    feats_train.obtain_from_simple(realfeat)
    realfeat=RealFeatures(fm_test_real)
    feats_test=SparseRealFeatures()
    feats_test.obtain_from_simple(realfeat)

    distance=SparseEuclidianDistance(feats_train, feats_train)
    dm_train=distance.get_distance_matrix()

    distance.init(feats_train, feats_test)
    dm_test=distance.get_distance_matrix()

if __name__=='__main__':
    from tools.load import LoadMatrix
    lm=LoadMatrix()
    fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
    sparse_euclidian_distance()
# The approach applied below, processing input data loaded from a file, is a
# crucial building block for writing your own sample applications. It is just
# one example of what can be done using the distance functions provided by
# shogun.
#
# First, determine what type your data will be, because this determines which
# distance function you can use.
#
# This example loads two stored matrices of real values from different
# files and initializes the matrices to 'RealFeatures'.
# Each column of the matrices corresponds to one data point.
#
# The distance is initialized with two data sets (here the same data set
# twice, as shown in the first call), and a pairwise distance matrix (of
# extended Jaccard coefficients) is computed by 'get_distance_matrix'; it can
# be reaccessed through the same call.
#
# The method call 'init'* binds the given data sets, after which
# 'get_distance_matrix' computes the pairwise distance (extended Jaccard
# coefficient) matrix between these two data sets. The result can again be
# reaccessed by 'get_distance_matrix'.
#
# *Note that the previously computed distance matrix can no longer be
# reaccessed by 'get_distance_matrix'.
#
# For more details see doc/classshogun_1_1CTanimotoDistance.html.
#
# Obviously, using the Tanimoto distance/coefficient is not limited to
# this showcase example.

def tanimoto_distance ():
    print 'TanimotoDistance'
    from shogun.Features import RealFeatures
    from shogun.Distance import TanimotoDistance

    feats_train=RealFeatures(fm_train_real)
    feats_test=RealFeatures(fm_test_real)

    distance=TanimotoDistance(feats_train, feats_train)
    dm_train=distance.get_distance_matrix()

    distance.init(feats_train, feats_test)
    dm_test=distance.get_distance_matrix()

if __name__=='__main__':
    from tools.load import LoadMatrix
    lm=LoadMatrix()
    fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
    tanimoto_distance()
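# The extended Jaccard (Tanimoto) coefficient mentioned above is commonly
# defined as s(x,y) = x.y / (||x||^2 + ||y||^2 - x.y). Whether shogun reports
# the coefficient itself or a distance derived from it is best checked in the
# class documentation; the numpy sketch below (made-up helper name) only
# illustrates the coefficient.
from numpy import dot
from numpy.random import rand

def tanimoto_coefficient(x, y):
    # extended Jaccard coefficient of two real-valued vectors
    xy=dot(x, y)
    return xy/(dot(x, x)+dot(y, y)-xy)

x=rand(4)
y=rand(4)
print tanimoto_coefficient(x, y)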
# In this example the Histogram algorithm object computes a histogram over all # 16bit unsigned integers in the features. def histogram (): print 'Histogram' from shogun.Features import StringWordFeatures, StringCharFeatures, DNA from shogun.Distribution import Histogram order=3 gap=0 reverse=False charfeat=StringCharFeatures(DNA) charfeat.set_features(fm_dna) feats=StringWordFeatures(charfeat.get_alphabet()) feats.obtain_from_char(charfeat, order-1, order, gap, reverse) histo=Histogram(feats) histo.train() histo.get_histogram() num_examples=feats.get_num_vectors() num_param=histo.get_num_model_parameters() #for i in xrange(num_examples): # for j in xrange(num_param): # histo.get_log_derivative(j, i) histo.get_log_likelihood() histo.get_log_likelihood_sample() ########################################################################### # call functions ########################################################################### if __name__=='__main__': from tools.load import LoadMatrix lm=LoadMatrix() fm_dna=lm.load_dna('../data/fm_train_dna.dat') histogram()
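# Conceptually, the Histogram distribution just counts how often each word
# value occurs across all feature vectors and normalizes the counts to
# probabilities; the log-likelihood of a sample is then the sum of the
# log-probabilities of its words. A minimal numpy sketch of that idea (an
# assumption about the model, not shogun code):
from numpy import array, uint16, bincount, log

words=array([0, 1, 1, 3, 3, 3, 2], dtype=uint16)
counts=bincount(words)              # histogram over the observed words
probs=counts/float(counts.sum())    # normalize counts to probabilities
print probs
print log(probs[words]).sum()       # log-likelihood of the sample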
# In this example a hidden Markov model with 3 states and 6 observation
# symbols (the CUBE alphabet) is trained on a string data set. After calling
# the constructor of the HMM class with the number of states and observation
# symbols, the model is trained via the Baum-Welch algorithm, which estimates
# the optimal transition and emission probabilities. The best path, i.e. the
# path with the highest probability given the model, can then be obtained
# using best_path and get_best_path_state.

def hmm ():
    print 'HMM'
    from shogun.Features import StringWordFeatures, StringCharFeatures, CUBE
    from shogun.Distribution import HMM, BW_NORMAL

    N=3
    M=6
    pseudo=1e-1
    order=1
    gap=0
    reverse=False

    charfeat=StringCharFeatures(CUBE)
    charfeat.set_features(fm_cube)
    feats=StringWordFeatures(charfeat.get_alphabet())
    feats.obtain_from_char(charfeat, order-1, order, gap, reverse)

    hmm=HMM(feats, N, M, pseudo)
    hmm.train()
    hmm.baum_welch_viterbi_train(BW_NORMAL)

    num_examples=feats.get_num_vectors()
    num_param=hmm.get_num_model_parameters()
    for i in xrange(num_examples):
        for j in xrange(num_param):
            hmm.get_log_derivative(j, i)

    best_path=0
    best_path_state=0
    for i in xrange(num_examples):
        best_path+=hmm.best_path(i)
        for j in xrange(N):
            best_path_state+=hmm.get_best_path_state(i, j)

    hmm.get_log_likelihood()
    hmm.get_log_likelihood_sample()

###########################################################################
# call functions
###########################################################################

if __name__=='__main__':
    from tools.load import LoadMatrix
    lm=LoadMatrix()
    fm_cube=lm.load_cubes('../data/fm_train_cube.dat')
    hmm()
# Trains an inhomogeneous Markov chain of order 3 on a DNA string data set.
# Due to the structure of the Markov chain it is very similar to an HMM with
# just one chain of connected hidden states, which is why we termed this a
# linear HMM.

def linear_hmm ():
    print 'LinearHMM'
    from shogun.Features import StringWordFeatures, StringCharFeatures, DNA
    from shogun.Distribution import LinearHMM

    order=3
    gap=0
    reverse=False

    charfeat=StringCharFeatures(DNA)
    charfeat.set_features(fm_dna)
    feats=StringWordFeatures(charfeat.get_alphabet())
    feats.obtain_from_char(charfeat, order-1, order, gap, reverse)

    hmm=LinearHMM(feats)
    hmm.train()

    hmm.get_transition_probs()

    num_examples=feats.get_num_vectors()
    num_param=hmm.get_num_model_parameters()
    for i in xrange(num_examples):
        for j in xrange(num_param):
            hmm.get_log_derivative(j, i)

    hmm.get_log_likelihood()
    hmm.get_log_likelihood_sample()

###########################################################################
# call functions
###########################################################################

if __name__=='__main__':
    from tools.load import LoadMatrix
    lm=LoadMatrix()
    fm_dna=lm.load_dna('../data/fm_train_dna.dat')
    linear_hmm()
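# The essence of such an inhomogeneous Markov chain is that a separate symbol
# distribution is estimated for every sequence position. A pure-Python sketch
# for order 1 on DNA strings, using plain maximum-likelihood counting with a
# small pseudo count (the helper name is made up for illustration):
def train_linear_hmm(seqs, alphabet='ACGT', pseudo=0.1):
    # one emission distribution per sequence position
    length=len(seqs[0])
    model=[]
    for pos in xrange(length):
        counts=dict((c, pseudo) for c in alphabet)
        for s in seqs:
            counts[s[pos]]+=1
        total=sum(counts.values())
        model.append(dict((c, counts[c]/total) for c in alphabet))
    return model

model=train_linear_hmm(['ACGT', 'AAGT', 'ACGA'])
print model[0]   # symbol distribution at position 0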
# This example shows how to read and write plain ASCII files, binary files and
# HDF5 datasets.
#
# For ASCII files it shows how to obtain shogun's RealFeatures
# (a simple feature matrix of doubles with 1 column == 1 example, nr_columns ==
# number of examples) and also sparse features in SVM light format.
#
# Binary files use a custom native format, and datasets can be read/written
# from/to HDF5 files with an arbitrary group / path.
def io ():
    print 'Features IO'
    import numpy
    from shogun.Features import SparseRealFeatures, RealFeatures, Labels
    from shogun.Library import AsciiFile, BinaryFile, HDF5File

    feats=SparseRealFeatures(fm_train_real)
    feats2=SparseRealFeatures()

    f=BinaryFile("fm_train_sparsereal.bin","w")
    feats.save(f)

    f=AsciiFile("fm_train_sparsereal.ascii","w")
    feats.save(f)

    f=BinaryFile("fm_train_sparsereal.bin")
    feats2.load(f)

    f=AsciiFile("fm_train_sparsereal.ascii")
    feats2.load(f)

    feats=RealFeatures(fm_train_real)
    feats2=RealFeatures()

    f=BinaryFile("fm_train_real.bin","w")
    feats.save(f)

    f=HDF5File("fm_train_real.h5","w", "/data/doubles")
    feats.save(f)

    f=AsciiFile("fm_train_real.ascii","w")
    feats.save(f)

    f=BinaryFile("fm_train_real.bin")
    feats2.load(f)
    print "diff binary", numpy.max(numpy.abs(feats2.get_feature_matrix().flatten()-fm_train_real.flatten()))

    f=AsciiFile("fm_train_real.ascii")
    feats2.load(f)
    print "diff ascii", numpy.max(numpy.abs(feats2.get_feature_matrix().flatten()-fm_train_real.flatten()))

    lab=Labels(numpy.array([1.0,2.0,3.0]))
    lab2=Labels()

    f=AsciiFile("label_train_twoclass.ascii","w")
    lab.save(f)

    f=BinaryFile("label_train_twoclass.bin","w")
    lab.save(f)

    f=HDF5File("fm_train_real.h5","a", "/data/labels")
    lab.save(f)

    f=AsciiFile("label_train_twoclass.ascii")
    lab2.load(f)

    f=BinaryFile("label_train_twoclass.bin")
    lab2.load(f)

    f=HDF5File("fm_train_real.h5","r", "/data/doubles")
    feats2.load(f)
    print feats2.get_feature_matrix()

    f=HDF5File("fm_train_real.h5","r", "/data/labels")
    lab2.load(f)
    print lab2.get_labels()

    #clean up
    import os
    for f in ['fm_train_sparsereal.bin','fm_train_sparsereal.ascii',
            'fm_train_real.bin','fm_train_real.h5','fm_train_real.ascii',
            'label_train_twoclass.ascii','label_train_twoclass.bin']:
        os.unlink(f)

if __name__=='__main__':
    from tools.load import LoadMatrix
    lm=LoadMatrix()
    fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
    label_train_twoclass=lm.load_numbers('../data/label_train_twoclass.dat')
    io()
# This example demonstrates how to read and write data in the SVMLight Format # from Shogun. # import os from shogun.Features import SparseRealFeatures f=SparseRealFeatures() lab=f.load_svmlight_file('../data/train_sparsereal.light') f.write_svmlight_file('testwrite.light', lab) os.unlink('testwrite.light')
# This example demonstrates how to encode small natural numbers
# (between 0 and 255) in shogun using ByteFeatures.

from shogun.Features import ByteFeatures
from numpy import array, uint8, all

# create dense matrix A
A=array([[1,2,3],[4,0,0],[0,0,0],[0,5,0],[0,0,6],[9,9,9]], dtype=uint8)

# ... of type Byte
a=ByteFeatures(A)

# print some statistics about a
print a.get_num_vectors()
print a.get_num_features()

# get first feature vector and set it
print a.get_feature_vector(0)
a.set_feature_vector(array([1,4,0,0,0,9], dtype=uint8), 0)

# get matrix
a_out = a.get_feature_matrix()
print type(a_out), a_out.dtype
print a_out
assert(all(a_out==A))
# This example demonstrates how to encode features composed of 64bit integers
# in shogun using LongIntFeatures.

from shogun.Features import LongIntFeatures
from numpy import array, int64, all

# create dense matrix A
A=array([[1,2,3],[4,0,0],[0,0,0],[0,5,0],[0,0,6],[9,9,9]], dtype=int64)

# ... of type LongInt
a=LongIntFeatures(A)

# print some statistics about a
print a.get_num_vectors()
print a.get_num_features()

# get first feature vector and set it
print a.get_feature_vector(0)
a.set_feature_vector(array([1,4,0,0,0,9], dtype=int64), 0)

# get matrix
a_out = a.get_feature_matrix()
print type(a_out), a_out.dtype
print a_out
assert(all(a_out==A))
# This example shows how to encode features that live in various vector spaces # using the appropriate shogun objects. We demonstrate how to use # three types of features: ByteFeatures (small integer values), # LongIntFeatures (large integer values) and finally RealFeatures # (real-valued vectors). from shogun.Features import RealFeatures, LongIntFeatures, ByteFeatures from numpy import array, float64, int64, uint8, all # create dense matrices A,B,C A=array([[1,2,3],[4,0,0],[0,0,0],[0,5,0],[0,0,6],[9,9,9]], dtype=float64) B=array([[1,2,3],[4,0,0],[0,0,0],[0,5,0],[0,0,6],[9,9,9]], dtype=int64) C=array([[1,2,3],[4,0,0],[0,0,0],[0,5,0],[0,0,6],[9,9,9]], dtype=uint8) # ... of type Real, LongInt and Byte a=RealFeatures(A) b=LongIntFeatures(B) c=ByteFeatures(C) # or 16bit wide ... #feat1 = f.ShortFeatures(N.zeros((10,5),N.short)) #feat2 = f.WordFeatures(N.zeros((10,5),N.uint16)) # print some statistics about a print a.get_num_vectors() print a.get_num_features() # get first feature vector and set it print a.get_feature_vector(0) a.set_feature_vector(array([1,4,0,0,0,9], dtype=float64), 0) # get matrices a_out = a.get_feature_matrix() b_out = b.get_feature_matrix() c_out = c.get_feature_matrix() print type(a_out), a_out.dtype print a_out assert(all(a_out==A)) print type(b_out), b_out.dtype print b_out assert(all(b_out==B)) print type(c_out), c_out.dtype print c_out assert(all(c_out==C))
# This example demonstrates how to encode real-valued features in shogun,
# using RealFeatures.

from shogun.Features import RealFeatures
from numpy import array, float64, all

# create dense matrix A
A=array([[1,2,3],[4,0,0],[0,0,0],[0,5,0],[0,0,6],[9,9,9]], dtype=float64)

# ... of type Real
a=RealFeatures(A)

# print some statistics about a
print a.get_num_vectors()
print a.get_num_features()

# get first feature vector and set it
print a.get_feature_vector(0)
a.set_feature_vector(array([1,4,0,0,0,9], dtype=float64), 0)

# get matrix
a_out = a.get_feature_matrix()
print type(a_out), a_out.dtype
print a_out
assert(all(a_out==A))
# Creates features similar to the feature space of the SNP kernel. Useful when # working with linear methods. from shogun.Features import * from numpy import * sf=StringByteFeatures(DIGIT2) sf.load_ascii_file('x', False, DIGIT2, DIGIT2) print sf.get_features() snps=SNPFeatures(sf) print snps.get_feature_matrix() print snps.get_minor_base_string() print snps.get_major_base_string()
# This example demonstrates how to encode sparse (most entries zero),
# real-valued features in shogun using SparseRealFeatures.

from scipy.sparse import csc_matrix
from shogun.Features import SparseRealFeatures
from numpy import array, float64, all

# create dense matrix A and its sparse representation X
# note, will work with types other than float64 too,
# but requires recent scipy.sparse
A=array([[1,2,3],[4,0,0],[0,0,0],[0,5,0],[0,0,6],[9,9,9]], dtype=float64)
X=csc_matrix(A)
print A

# create sparse shogun features from dense matrix A
a=SparseRealFeatures(A)
a_out=a.get_full_feature_matrix()
print a_out
assert(all(a_out==A))

# create sparse shogun features from sparse matrix X
a.set_sparse_feature_matrix(X)
a_out=a.get_full_feature_matrix()
print a_out
assert(all(a_out==A))

# create sparse shogun features from sparse matrix X
a=SparseRealFeatures(X)
a_out=a.get_full_feature_matrix()
print a_out
assert(all(a_out==A))

# obtain (data,row,indptr) csc arrays of sparse shogun features
z=csc_matrix(a.get_sparse_feature_matrix())
z_out=z.todense()
print z_out
assert(all(z_out==A))
# This example demonstrates how to use compressed strings with shogun.
# We currently support reading and writing compressed files using
# LZO, GZIP, BZIP2 and LZMA. Furthermore, we demonstrate how to extract
# compressed streams on-the-fly in order to fit data sets into memory
# that would otherwise be too large.
#
from shogun.Features import StringCharFeatures, StringFileCharFeatures, RAWBYTE
from shogun.Library import UNCOMPRESSED,LZO,GZIP,BZIP2,LZMA, MSG_DEBUG
from shogun.PreProc import DecompressCharString

f=StringFileCharFeatures('features_string_char_compressed_modular.py', RAWBYTE)
print "original strings", f.get_features()

#uncompressed
f.save_compressed("foo_uncompressed.str", UNCOMPRESSED, 1)
f2=StringCharFeatures(RAWBYTE)
f2.load_compressed("foo_uncompressed.str", True)
print "uncompressed strings", f2.get_features()
print

# load compressed data and uncompress on load

#lzo
f.save_compressed("foo_lzo.str", LZO, 9)
f2=StringCharFeatures(RAWBYTE)
f2.load_compressed("foo_lzo.str", True)
print "lzo strings", f2.get_features()
print

##gzip
f.save_compressed("foo_gzip.str", GZIP, 9)
f2=StringCharFeatures(RAWBYTE)
f2.load_compressed("foo_gzip.str", True)
print "gzip strings", f2.get_features()
print

#bzip2
f.save_compressed("foo_bzip2.str", BZIP2, 9)
f2=StringCharFeatures(RAWBYTE)
f2.load_compressed("foo_bzip2.str", True)
print "bzip2 strings", f2.get_features()
print

#lzma
f.save_compressed("foo_lzma.str", LZMA, 9)
f2=StringCharFeatures(RAWBYTE)
f2.load_compressed("foo_lzma.str", True)
print "lzma strings", f2.get_features()
print

# load compressed data and uncompress via preprocessor
f2=StringCharFeatures(RAWBYTE)
f2.load_compressed("foo_lzo.str", False)
f2.add_preproc(DecompressCharString(LZO))
f2.apply_preproc()
print "lzo strings", f2.get_features()
print

# load compressed data and uncompress on-the-fly via preprocessor
f2=StringCharFeatures(RAWBYTE)
f2.load_compressed("foo_lzo.str", False)
f2.io.set_loglevel(MSG_DEBUG)
f2.add_preproc(DecompressCharString(LZO))
f2.enable_on_the_fly_preprocessing()
print "lzo strings", f2.get_features()
print

#clean up
import os
for f in ['foo_uncompressed.str', 'foo_lzo.str', 'foo_gzip.str',
        'foo_bzip2.str', 'foo_lzma.str']:
    if os.path.exists(f):
        os.unlink(f)

##########################################################################################
# some perfectly compressible stuff follows
##########################################################################################
##########################################################################################
##########################################################################################
##########################################################################################
##########################################################################################
##########################################################################################
##########################################################################################
##########################################################################################
##########################################################################################
##########################################################################################
# This example demonstrates how to encode ASCII strings (using the raw byte
# alphabet) in shogun.
from shogun.Features import StringCharFeatures, RAWBYTE
from numpy import array

#create string features
f=StringCharFeatures(['hey','guys','i','am','a','string'], RAWBYTE)

#and output several stats
print "max string length", f.get_max_vector_length()
print "number of strings", f.get_num_vectors()
print "length of first string", f.get_vector_length(0)
print "string[5]", ''.join(f.get_feature_vector(5))
print "strings", f.get_features()

#replace string 0
f.set_feature_vector(array(['t','e','s','t']), 0)

print "strings", f.get_features()
# This example demonstrates how to load ASCII features from a file into shogun. from shogun.Features import StringFileCharFeatures, RAWBYTE f = StringFileCharFeatures('features_string_file_char_modular.py', RAWBYTE) print "strings", f.get_features()
# This example demonstrates how to load string features from files.
# We cover two cases: First, we show how to obtain StringCharFeatures
# from a directory of text files (particularly useful in computational biology)
# and second, we demonstrate how to load StringCharFeatures from one
# (multi-line) file.
#
from shogun.Features import StringCharFeatures, RAWBYTE

# load features from directory
f=StringCharFeatures(RAWBYTE)
f.load_from_directory(".")

#and output several stats
print "max string length", f.get_max_vector_length()
print "number of strings", f.get_num_vectors()
print "length of first string", f.get_vector_length(0)
print "str[0,0:3]", f.get_feature(0,0), f.get_feature(0,1), f.get_feature(0,2)
print "len(str[0])", f.get_vector_length(0)
print "str[0]", f.get_feature_vector(0)

#or load features from file (one string per line)
f.load('features_string_char_modular.py')
print f.get_features()

#or load fasta file
#f.load_fasta('fasta.fa')
#print f.get_features()
# This creates a HashedWDFeatures object, i.e. an approximation to the Weighted # Degree kernel feature space via hashes. These features can be particularly fast # in linear SVM solvers. from numpy import * from shogun.Features import * from shogun.Library import MSG_DEBUG order=3 start_order=1 from_order=order hash_bits=2 x=[array([0,1,2,3,0,1,2,3,3,2,2,1,1],dtype=uint8)] print len(x[0]) f=StringByteFeatures(RAWDNA) f.io.set_loglevel(MSG_DEBUG) f.set_features(x) y=HashedWDFeatures(f,start_order,order,from_order,hash_bits) print y.get_dim_feature_space() fm=y.get_feature_matrix() print fm.shape print fm
# In this example, we demonstrate how to obtain string features by using a
# sliding window in a memory-efficient way. Instead of copying the string for
# each position of the sliding window, we only store a reference with respect
# to the complete string. This is particularly useful when working with
# genomic data, where storing all explicitly copied strings in memory quickly
# becomes infeasible. In addition to a sliding window (of a particular length)
# over all positions, we also support defining a custom position list.

from shogun.Features import StringCharFeatures, DNA
from shogun.Library import DynamicIntArray

# create string features with a single string
s=10*'A' + 10*'C' + 10*'G' + 10*'T'
f=StringCharFeatures([s], DNA)

# slide a window of length 5 over features
# (memory efficient, does not copy strings)
f.obtain_by_sliding_window(5,1)
print f.get_num_vectors()
print f.get_vector_length(0)
print f.get_vector_length(1)
print f.get_features()

# slide a window of length 4 over features
# (memory efficient, does not copy strings)
f.obtain_by_sliding_window(4,1)
print f.get_num_vectors()
print f.get_vector_length(0)
print f.get_vector_length(1)
print f.get_features()

# extract string-windows at position 0,6,16,25 of window size 4
# (memory efficient, does not copy strings)
f.set_features([s])
positions=DynamicIntArray()
positions.append_element(0)
positions.append_element(6)
positions.append_element(16)
positions.append_element(25)

f.obtain_by_position_list(4,positions)
print f.get_features()

# now extract windows of size 8 from the same position list
f.obtain_by_position_list(8,positions)
print f.get_features()
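# The memory-saving trick is to represent each window only by its offset into
# the single underlying string instead of copying the substring. A tiny
# pure-Python illustration of that idea (not shogun API, just the concept):
def window_refs(s, winsize, step):
    # store (start, length) pairs instead of substring copies
    return [(start, winsize) for start in xrange(0, len(s)-winsize+1, step)]

s=10*'A' + 10*'C' + 10*'G' + 10*'T'
refs=window_refs(s, 5, 1)
print len(refs)
start, length=refs[3]
print s[start:start+length]   # materialize one window only on demand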
# This example demonstrates how to encode string
# features efficiently by creating a more compactly encoded
# bit-string from StringCharFeatures.
# For instance, when working with the DNA alphabet {A,T,G,C}
# using 1 char = 1 byte per symbol would be wasteful, as we
# can encode 4 symbols using 2 bits only.
# Here, this is done in chunks of 64bit (ulong).

from shogun.Features import StringCharFeatures, StringUlongFeatures, RAWBYTE
from numpy import array, uint64

#create string features
cf=StringCharFeatures(['hey','guys','string'], RAWBYTE)
uf=StringUlongFeatures(RAWBYTE)

#start=0, order=2, gap=0, rev=False)
uf.obtain_from_char(cf, 0, 2, 0, False)

#and output several stats
print "max string length", uf.get_max_vector_length()
print "number of strings", uf.get_num_vectors()
print "length of first string", uf.get_vector_length(0)
print "string[2]", uf.get_feature_vector(2)
print "strings", uf.get_features()

#replace string 0
uf.set_feature_vector(array([1,2,3,4,5], dtype=uint64), 0)

print "strings", uf.get_features()
# This example demonstrates how to encode string
# features efficiently by creating a more compactly encoded
# bit-string from StringCharFeatures.
# For instance, when working with the DNA alphabet {A,T,G,C}
# using 1 char = 1 byte per symbol would be wasteful, as we
# can encode 4 symbols using 2 bits only.
# Here, this is done in chunks of 16bit (word).

from shogun.Features import StringCharFeatures, StringWordFeatures, RAWBYTE
from numpy import array, uint16

#create string features
cf=StringCharFeatures(['hey','guys','string'], RAWBYTE)
wf=StringWordFeatures(RAWBYTE)

#start=0, order=2, gap=0, rev=False)
wf.obtain_from_char(cf, 0, 2, 0, False)

#and output several stats
print "max string length", wf.get_max_vector_length()
print "number of strings", wf.get_num_vectors()
print "length of first string", wf.get_vector_length(0)
print "string[2]", wf.get_feature_vector(2)
print "strings", wf.get_features()

#replace string 0
wf.set_feature_vector(array([1,2,3,4,5], dtype=uint16), 0)

print "strings", wf.get_features()
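# To see why the packed representation is compact, consider mapping each DNA
# symbol to 2 bits and shifting consecutive symbols into one 16bit word. The
# sketch below only illustrates the idea; shogun's exact bit layout inside
# 'obtain_from_char' may differ.
from numpy import uint16

CODE={'A':0, 'C':1, 'G':2, 'T':3}

def pack_kmer(kmer):
    # pack up to 8 DNA symbols into one 16bit word, 2 bits per symbol
    w=0
    for c in kmer:
        w=(w<<2)|CODE[c]
    return uint16(w)

print pack_kmer('AC')   # 0b0001 == 1
print pack_kmer('TT')   # 0b1111 == 15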
# This example demonstrates the use of the AUC Kernel. ########################################################################### # kernel can be used to maximize AUC instead of margin in SVMs ########################################################################### def auc (): print 'AUC' from shogun.Kernel import GaussianKernel, AUCKernel from shogun.Features import RealFeatures, Labels feats_train=RealFeatures(fm_train_real) width=1.7 subkernel=GaussianKernel(feats_train, feats_train, width) kernel=AUCKernel(0, subkernel) kernel.setup_auc_maximization( Labels(label_train_real) ) km_train=kernel.get_kernel_matrix() if __name__=='__main__': from tools.load import LoadMatrix from numpy import double lm=LoadMatrix() fm_train_real=double(lm.load_numbers('../data/fm_train_real.dat')) label_train_real=lm.load_labels('../data/label_train_twoclass.dat') auc()
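# For orientation, the quantity targeted here is the area under the ROC curve
# (AUC): the probability that a positive example is ranked above a negative
# one. A minimal sketch computing AUC from real-valued classifier outputs
# (made-up helper name, counting ties as one half):
from numpy import array

def auc_score(outputs, labels):
    # fraction of (positive, negative) pairs that are ranked correctly
    pos=[o for o, l in zip(outputs, labels) if l==1]
    neg=[o for o, l in zip(outputs, labels) if l==-1]
    wins=sum(1.0 for p in pos for n in neg if p>n)
    ties=sum(0.5 for p in pos for n in neg if p==n)
    return (wins+ties)/(len(pos)*len(neg))

print auc_score(array([0.9, 0.3, 0.8, 0.1]), array([1, -1, 1, -1]))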
# This is an example for the initialization of the chi2-kernel on real data, where # each column of the matrices corresponds to one training/test example. ########################################################################### # chi2 kernel ########################################################################### def chi2 (): print 'Chi2' from shogun.Kernel import Chi2Kernel from shogun.Features import RealFeatures feats_train=RealFeatures(fm_train_real) feats_test=RealFeatures(fm_test_real) width=1.4 size_cache=10 kernel=Chi2Kernel(feats_train, feats_train, width, size_cache) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() if __name__=='__main__': from tools.load import LoadMatrix from numpy import double lm=LoadMatrix() fm_train_real=double(lm.load_numbers('../data/fm_train_real.dat')) fm_test_real=double(lm.load_numbers('../data/fm_test_real.dat')) chi2()
# This example shows how to combine a precomputed custom kernel (a polynomial
# kernel matrix wrapped in a CustomKernel) with an ordinary sub-kernel inside
# a CombinedKernel, train a LibSVM classifier on it, and rebuild the combined
# kernel for prediction on the test data.
def combined_custom():
    from shogun.Features import CombinedFeatures, RealFeatures, Labels
    from shogun.Kernel import CombinedKernel, PolyKernel, CustomKernel
    from shogun.Classifier import LibSVM

    kernel = CombinedKernel()
    feats_train = CombinedFeatures()

    tfeats = RealFeatures(fm_train_real)
    tkernel = PolyKernel(10,3)
    tkernel.init(tfeats, tfeats)
    K = tkernel.get_kernel_matrix()
    kernel.append_kernel(CustomKernel(K))

    subkfeats_train = RealFeatures(fm_train_real)
    feats_train.append_feature_obj(subkfeats_train)
    subkernel = PolyKernel(10,2)
    kernel.append_kernel(subkernel)

    kernel.init(feats_train, feats_train)

    labels = Labels(fm_label_twoclass)
    svm = LibSVM(1.0, kernel, labels)
    svm.train()

    kernel = CombinedKernel()
    feats_pred = CombinedFeatures()

    pfeats = RealFeatures(fm_test_real)
    tkernel = PolyKernel(10,3)
    tkernel.init(tfeats, pfeats)
    K = tkernel.get_kernel_matrix()
    kernel.append_kernel(CustomKernel(K))

    subkfeats_test = RealFeatures(fm_test_real)
    feats_pred.append_feature_obj(subkfeats_test)
    subkernel = PolyKernel(10, 2)
    kernel.append_kernel(subkernel)

    kernel.init(feats_train, feats_pred)

    svm.set_kernel(kernel)
    svm.classify()

if __name__=='__main__':
    from tools.load import LoadMatrix
    lm=LoadMatrix()
    fm_train_real = lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real = lm.load_numbers('../data/fm_test_real.dat')
    fm_label_twoclass = lm.load_labels('../data/label_train_twoclass.dat')
    combined_custom()
# This is an example for the initialization of a combined kernel, i.e. a
# weighted sum of (in this case three) kernels on real-valued data. The
# sub-kernel weights are all set to 1.
#
def combined():
    print 'Combined'
    from shogun.Kernel import CombinedKernel, GaussianKernel, FixedDegreeStringKernel, LocalAlignmentStringKernel
    from shogun.Features import RealFeatures, StringCharFeatures, CombinedFeatures, DNA

    kernel=CombinedKernel()
    feats_train=CombinedFeatures()
    feats_test=CombinedFeatures()

    subkfeats_train=RealFeatures(fm_train_real)
    subkfeats_test=RealFeatures(fm_test_real)
    subkernel=GaussianKernel(10, 1.1)
    feats_train.append_feature_obj(subkfeats_train)
    feats_test.append_feature_obj(subkfeats_test)
    kernel.append_kernel(subkernel)

    subkfeats_train=StringCharFeatures(fm_train_dna, DNA)
    subkfeats_test=StringCharFeatures(fm_test_dna, DNA)
    degree=3
    subkernel=FixedDegreeStringKernel(10, degree)
    feats_train.append_feature_obj(subkfeats_train)
    feats_test.append_feature_obj(subkfeats_test)
    kernel.append_kernel(subkernel)

    subkfeats_train=StringCharFeatures(fm_train_dna, DNA)
    subkfeats_test=StringCharFeatures(fm_test_dna, DNA)
    subkernel=LocalAlignmentStringKernel(10)
    feats_train.append_feature_obj(subkfeats_train)
    feats_test.append_feature_obj(subkfeats_test)
    kernel.append_kernel(subkernel)

    kernel.init(feats_train, feats_train)
    km_train=kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    km_test=kernel.get_kernel_matrix()

if __name__=='__main__':
    from tools.load import LoadMatrix
    from numpy import double
    lm=LoadMatrix()
    fm_train_real=double(lm.load_numbers('../data/fm_train_real.dat'))
    fm_test_real=double(lm.load_numbers('../data/fm_test_real.dat'))
    fm_train_dna=lm.load_dna('../data/fm_train_dna.dat')
    fm_test_dna=lm.load_dna('../data/fm_test_dna.dat')
    combined()
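# Numerically, a combined kernel with sub-kernel weights beta_i is just the
# entry-wise weighted sum of the sub-kernel matrices, K = sum_i beta_i K_i
# (all weights 1 in the example above). A toy numpy sketch of that
# composition on random data:
from numpy import dot
from numpy.random import rand

X=rand(3, 5)              # 5 examples, one per column
K1=dot(X.T, X)            # linear kernel matrix
K2=(dot(X.T, X)+1)**2     # inhomogeneous polynomial kernel, degree 2
weights=[1.0, 1.0]
K=weights[0]*K1+weights[1]*K2
print K.shape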
# This is an example for the initialization of the CommUlongString kernel.
# This kernel sums over k-mer matches (k='order'). For efficient computation a
# preprocessor is used that extracts and sorts all k-mers. If 'use_sign' is
# set, each k-mer is counted only once.

def comm_ulong_string ():
    print 'CommUlongString'
    from shogun.Kernel import CommUlongStringKernel
    from shogun.Features import StringUlongFeatures, StringCharFeatures, DNA
    from shogun.PreProc import SortUlongString

    order=3
    gap=0
    reverse=False

    charfeat=StringCharFeatures(DNA)
    charfeat.set_features(fm_train_dna)
    feats_train=StringUlongFeatures(charfeat.get_alphabet())
    feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
    preproc=SortUlongString()
    preproc.init(feats_train)
    feats_train.add_preproc(preproc)
    feats_train.apply_preproc()

    charfeat=StringCharFeatures(DNA)
    charfeat.set_features(fm_test_dna)
    feats_test=StringUlongFeatures(charfeat.get_alphabet())
    feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
    feats_test.add_preproc(preproc)
    feats_test.apply_preproc()

    use_sign=False

    kernel=CommUlongStringKernel(feats_train, feats_train, use_sign)
    km_train=kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    km_test=kernel.get_kernel_matrix()

if __name__=='__main__':
    from tools.load import LoadMatrix
    lm=LoadMatrix()
    fm_train_dna=lm.load_dna('../data/fm_train_dna.dat')
    fm_test_dna=lm.load_dna('../data/fm_test_dna.dat')
    comm_ulong_string()
# This is an example for the initialization of the CommWordString kernel (aka
# Spectrum or n-gram kernel; its name is derived from the unix command comm).
# This kernel sums over k-mer matches (k='order'). For efficient computation a
# preprocessor is used that extracts and sorts all k-mers. If 'use_sign' is
# set, each k-mer is counted only once.

def comm_word_string ():
    print 'CommWordString'
    from shogun.Kernel import CommWordStringKernel
    from shogun.Features import StringWordFeatures, StringCharFeatures, DNA
    from shogun.PreProc import SortWordString

    order=3
    gap=0
    reverse=False

    charfeat=StringCharFeatures(DNA)
    charfeat.set_features(fm_train_dna)
    feats_train=StringWordFeatures(charfeat.get_alphabet())
    feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
    preproc=SortWordString()
    preproc.init(feats_train)
    feats_train.add_preproc(preproc)
    feats_train.apply_preproc()

    charfeat=StringCharFeatures(DNA)
    charfeat.set_features(fm_test_dna)
    feats_test=StringWordFeatures(charfeat.get_alphabet())
    feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
    feats_test.add_preproc(preproc)
    feats_test.apply_preproc()

    use_sign=False

    kernel=CommWordStringKernel(feats_train, feats_train, use_sign)
    km_train=kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    km_test=kernel.get_kernel_matrix()

if __name__=='__main__':
    from tools.load import LoadMatrix
    lm=LoadMatrix()
    fm_train_dna=lm.load_dna('../data/fm_train_dna.dat')
    fm_test_dna=lm.load_dna('../data/fm_test_dna.dat')
    comm_word_string()
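# What the (unweighted) spectrum kernel of order k computes is the dot product
# between the k-mer count vectors of the two strings. A small pure-Python
# sketch (made-up helper names), handy for checking a single kernel entry
# without the sorting/preprocessor machinery:
def kmer_counts(s, k):
    counts={}
    for i in xrange(len(s)-k+1):
        kmer=s[i:i+k]
        counts[kmer]=counts.get(kmer, 0)+1
    return counts

def spectrum_kernel(s, t, k):
    # dot product of the two k-mer count vectors
    cs=kmer_counts(s, k)
    ct=kmer_counts(t, k)
    return sum(cs[m]*ct.get(m, 0) for m in cs)

print spectrum_kernel('ACGTACGT', 'ACGTCCGT', 3)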
# The constant kernel gives a trivial kernel matrix with all entries set to the same value # defined by the argument 'c'. # def const (): print 'Const' from shogun.Features import DummyFeatures from shogun.Kernel import ConstKernel feats_train=DummyFeatures(10) feats_test=DummyFeatures(17) c=23. kernel=ConstKernel(feats_train, feats_train, c) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() if __name__=='__main__': const()
# A user defined custom kernel is assigned in this example, for which only the
# lower triangle may be given (set_triangle_kernel_matrix_from_triangle), or a
# full matrix (set_full_kernel_matrix_from_full), or a full matrix which is
# then internally stored as a triangle (set_triangle_kernel_matrix_from_full).
# The resulting kernel matrices are retrieved via 'get_kernel_matrix', once
# with float64 and once with float32 input data.
#
def custom ():
    print 'Custom'
    from numpy.random import rand
    from numpy import array, float32
    from shogun.Kernel import CustomKernel

    dim=7
    data=rand(dim, dim)
    symdata=data+data.T

    lowertriangle=array([symdata[(x,y)] for x in xrange(symdata.shape[1])
        for y in xrange(symdata.shape[0]) if y<=x])

    kernel=CustomKernel()

    # once with float64's
    kernel.set_triangle_kernel_matrix_from_triangle(lowertriangle)
    km_triangletriangle=kernel.get_kernel_matrix()

    kernel.set_triangle_kernel_matrix_from_full(symdata)
    km_fulltriangle=kernel.get_kernel_matrix()

    kernel.set_full_kernel_matrix_from_full(data)
    km_fullfull=kernel.get_kernel_matrix()

    # now once with float32's
    data=array(data,dtype=float32)

    kernel.set_triangle_kernel_matrix_from_triangle(lowertriangle)
    km_triangletriangle=kernel.get_kernel_matrix()

    kernel.set_triangle_kernel_matrix_from_full(symdata)
    km_fulltriangle=kernel.get_kernel_matrix()

    kernel.set_full_kernel_matrix_from_full(data)
    km_fullfull=kernel.get_kernel_matrix()

if __name__=='__main__':
    from numpy.random import seed
    seed(42)
    custom()
# This is an example for the initialization of the diag kernel. The diag
# kernel produces a kernel matrix in which all off-diagonal entries are zero
# and the entries on the main diagonal are set to the constant 'diag'.
def diag ():
    print 'Diag'
    from shogun.Features import DummyFeatures
    from shogun.Kernel import DiagKernel

    feats_train=DummyFeatures(10)
    feats_test=DummyFeatures(17)
    diag=23.

    kernel=DiagKernel(feats_train, feats_train, diag)
    km_train=kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    km_test=kernel.get_kernel_matrix()

if __name__=='__main__':
    diag()
# With the distance kernel one can use any of the following distance metrics:
# BrayCurtisDistance()
# CanberraMetric()
# CanberraWordDistance()
# ChebyshewMetric()
# ChiSquareDistance()
# CosineDistance()
# Distance()
# EuclidianDistance()
# GeodesicMetric()
# HammingWordDistance()
# JensenMetric()
# ManhattanMetric()
# ManhattanWordDistance()
# MinkowskiMetric()
# RealDistance()
# SimpleDistance()
# SparseDistance()
# SparseEuclidianDistance()
# StringDistance()
# TanimotoDistance()
#
def distance ():
    print 'Distance'
    from shogun.Kernel import DistanceKernel
    from shogun.Features import RealFeatures
    from shogun.Distance import EuclidianDistance

    feats_train=RealFeatures(fm_train_real)
    feats_test=RealFeatures(fm_test_real)
    width=1.7
    distance=EuclidianDistance()

    kernel=DistanceKernel(feats_train, feats_train, width, distance)
    km_train=kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    km_test=kernel.get_kernel_matrix()

if __name__=='__main__':
    from tools.load import LoadMatrix
    from numpy import double
    lm=LoadMatrix()
    fm_train_real=double(lm.load_numbers('../data/fm_train_real.dat'))
    fm_test_real=double(lm.load_numbers('../data/fm_test_real.dat'))
    distance()
# The class FKFeatures implements Fisher kernel features obtained from
# two Hidden Markov models.
#
# It was used in
#
# K. Tsuda, M. Kawanabe, G. Raetsch, S. Sonnenburg, and K.R. Mueller. A new
# discriminative kernel from probabilistic models. Neural Computation,
# 14:2397-2414, 2002.
#
# which also has the details.
#
# Note that FK-features are computed on the fly, so to be effective, feature
# caching should be enabled.
#
# It inherits its functionality from CSimpleFeatures, which should be
# consulted for further reference.
#
def fisher ():
    print "Fisher Kernel"
    from shogun.Features import StringCharFeatures, StringWordFeatures, FKFeatures, DNA
    from shogun.Kernel import PolyKernel
    from shogun.Distribution import HMM, BW_NORMAL

    N=1 # toy HMM with 1 state
    M=4 # 4 observations -> DNA
    pseudo=1e-1
    order=1
    gap=0
    reverse=False
    kargs=[1, False, True]

    # train HMM for positive class
    charfeat=StringCharFeatures(fm_hmm_pos, DNA)
    hmm_pos_train=StringWordFeatures(charfeat.get_alphabet())
    hmm_pos_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
    pos=HMM(hmm_pos_train, N, M, pseudo)
    pos.baum_welch_viterbi_train(BW_NORMAL)

    # train HMM for negative class
    charfeat=StringCharFeatures(fm_hmm_neg, DNA)
    hmm_neg_train=StringWordFeatures(charfeat.get_alphabet())
    hmm_neg_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
    neg=HMM(hmm_neg_train, N, M, pseudo)
    neg.baum_welch_viterbi_train(BW_NORMAL)

    # Kernel training data
    charfeat=StringCharFeatures(fm_train_dna, DNA)
    wordfeats_train=StringWordFeatures(charfeat.get_alphabet())
    wordfeats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)

    # Kernel testing data
    charfeat=StringCharFeatures(fm_test_dna, DNA)
    wordfeats_test=StringWordFeatures(charfeat.get_alphabet())
    wordfeats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)

    # get kernel on training data
    pos.set_observations(wordfeats_train)
    neg.set_observations(wordfeats_train)
    feats_train=FKFeatures(10, pos, neg)
    feats_train.set_opt_a(-1) #estimate prior
    kernel=PolyKernel(feats_train, feats_train, *kargs)
    km_train=kernel.get_kernel_matrix()

    # get kernel on testing data
    pos_clone=HMM(pos)
    neg_clone=HMM(neg)
    pos_clone.set_observations(wordfeats_test)
    neg_clone.set_observations(wordfeats_test)
    feats_test=FKFeatures(10, pos_clone, neg_clone)
    feats_test.set_a(feats_train.get_a()) #use prior from training data
    kernel.init(feats_train, feats_test)
    km_test=kernel.get_kernel_matrix()

if __name__=='__main__':
    from tools.load import LoadMatrix
    from numpy import where
    lm=LoadMatrix()
    fm_train_dna=lm.load_dna('../data/fm_train_dna.dat')
    fm_test_dna=lm.load_dna('../data/fm_test_dna.dat')
    label_train_dna=lm.load_labels('../data/label_train_dna.dat')
    fm_hmm_pos=[ fm_train_dna[i] for i in where([label_train_dna==1])[1] ]
    fm_hmm_neg=[ fm_train_dna[i] for i in where([label_train_dna==-1])[1] ]
    fisher()
# The FixedDegree String kernel takes as input two strings of the same size
# and counts the number of matching substrings of length 'degree'.
def fixed_degree_string ():
    print 'FixedDegreeString'
    from shogun.Features import StringCharFeatures, DNA
    from shogun.Kernel import FixedDegreeStringKernel

    feats_train=StringCharFeatures(fm_train_dna, DNA)
    feats_test=StringCharFeatures(fm_test_dna, DNA)
    degree=3

    kernel=FixedDegreeStringKernel(feats_train, feats_train, degree)
    km_train=kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    km_test=kernel.get_kernel_matrix()

if __name__=='__main__':
    from tools.load import LoadMatrix
    lm=LoadMatrix()
    fm_train_dna=lm.load_dna('../data/fm_train_dna.dat')
    fm_test_dna=lm.load_dna('../data/fm_test_dna.dat')
    fixed_degree_string()
# The well known Gaussian kernel (swiss army knife for SVMs) on dense real valued features. def gaussian (): print 'Gaussian' from shogun.Features import RealFeatures from shogun.Kernel import GaussianKernel feats_train=RealFeatures(fm_train_real) feats_test=RealFeatures(fm_test_real) width=1.9 kernel=GaussianKernel(feats_train, feats_train, width) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() if __name__=='__main__': from tools.load import LoadMatrix lm=LoadMatrix() fm_train_real=lm.load_numbers('../data/fm_train_real.dat') fm_test_real=lm.load_numbers('../data/fm_test_real.dat') gaussian()
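# A single entry of the Gaussian kernel matrix can be checked directly with
# numpy. Note the convention assumed here: the 'width' parameter sits in the
# denominator of the exponent, k(x,y) = exp(-||x-y||^2 / width).
from numpy import exp
from numpy.random import rand

def gaussian_kernel(x, y, width):
    # assumed convention: exp(-||x-y||^2 / width)
    d=x-y
    return exp(-(d*d).sum()/width)

x=rand(4)
y=rand(4)
print gaussian_kernel(x, y, 1.9)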
# An experimental kernel inspired by the WeightedDegreePositionStringKernel
# and the Gaussian kernel. The idea is to shift the dimensions of the input
# vectors against each other. 'shift_step' is the step size of the shifts and
# 'max_shift' is the maximal shift.
def gaussian_shift ():
    print 'GaussianShift'
    from shogun.Features import RealFeatures
    from shogun.Kernel import GaussianShiftKernel

    feats_train=RealFeatures(fm_train_real)
    feats_test=RealFeatures(fm_test_real)
    width=1.8
    max_shift=2
    shift_step=1

    kernel=GaussianShiftKernel(
        feats_train, feats_train, width, max_shift, shift_step)
    km_train=kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    km_test=kernel.get_kernel_matrix()

if __name__=='__main__':
    from tools.load import LoadMatrix
    lm=LoadMatrix()
    fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
    gaussian_shift()
# The HistogramWordString kernel computes the TOP kernel on inhomogeneous
# Markov chains.
def plugin_estimate_histogram ():
    print 'PluginEstimate w/ HistogramWord'
    from shogun.Features import StringCharFeatures, StringWordFeatures, DNA, Labels
    from shogun.Kernel import HistogramWordStringKernel
    from shogun.Classifier import PluginEstimate

    order=3
    gap=0
    reverse=False

    charfeat=StringCharFeatures(DNA)
    charfeat.set_features(fm_train_dna)
    feats_train=StringWordFeatures(charfeat.get_alphabet())
    feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)

    charfeat=StringCharFeatures(DNA)
    charfeat.set_features(fm_test_dna)
    feats_test=StringWordFeatures(charfeat.get_alphabet())
    feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)

    pie=PluginEstimate()
    labels=Labels(label_train_dna)
    pie.set_labels(labels)
    pie.set_features(feats_train)
    pie.train()

    kernel=HistogramWordStringKernel(feats_train, feats_train, pie)
    km_train=kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    pie.set_features(feats_test)
    pie.classify().get_labels()
    km_test=kernel.get_kernel_matrix()

if __name__=='__main__':
    from tools.load import LoadMatrix
    lm=LoadMatrix()
    fm_train_dna=lm.load_dna('../data/fm_train_dna.dat')
    fm_test_dna=lm.load_dna('../data/fm_test_dna.dat')
    label_train_dna=lm.load_labels('../data/label_train_dna.dat')
    plugin_estimate_histogram()
# This example shows how to compute a Gaussian kernel on real-valued data and
# save the resulting kernel matrices to ASCII files.
def gaussian ():
    print 'Gaussian'
    from shogun.Features import RealFeatures
    from shogun.Kernel import GaussianKernel
    from shogun.Library import AsciiFile

    feats_train=RealFeatures(fm_train_real)
    feats_test=RealFeatures(fm_test_real)
    width=1.9

    kernel=GaussianKernel(feats_train, feats_train, width)
    km_train=kernel.get_kernel_matrix()
    f=AsciiFile("gaussian_train.ascii","w")
    kernel.save(f)
    del f

    kernel.init(feats_train, feats_test)
    km_test=kernel.get_kernel_matrix()
    f=AsciiFile("gaussian_test.ascii","w")
    kernel.save(f)
    del f

    #clean up
    import os
    os.unlink("gaussian_test.ascii")
    os.unlink("gaussian_train.ascii")

if __name__=='__main__':
    from tools.load import LoadMatrix
    lm=LoadMatrix()
    fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
    gaussian()
# This is an example for the initialization of a linear kernel on raw byte # data. ########################################################################### # linear kernel on byte features ########################################################################### def linear_byte(): print 'LinearByte' from shogun.Kernel import LinearByteKernel from shogun.Features import ByteFeatures feats_train=ByteFeatures(fm_train_byte) feats_test=ByteFeatures(fm_test_byte) kernel=LinearByteKernel(feats_train, feats_train) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() if __name__=='__main__': from tools.load import LoadMatrix from numpy import ubyte lm=LoadMatrix() fm_train_byte=ubyte(lm.load_numbers('../data/fm_train_byte.dat')) fm_test_byte=ubyte(lm.load_numbers('../data/fm_test_byte.dat')) linear_byte()
# This is an example for the initialization of a linear kernel on real valued # data using scaling factor 1.2. def linear (): print 'Linear' from shogun.Features import RealFeatures from shogun.Kernel import LinearKernel, AvgDiagKernelNormalizer feats_train=RealFeatures(fm_train_real) feats_test=RealFeatures(fm_test_real) scale=1.2 kernel=LinearKernel() kernel.set_normalizer(AvgDiagKernelNormalizer(scale)) kernel.init(feats_train, feats_train) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() if __name__=='__main__': from tools.load import LoadMatrix lm=LoadMatrix() fm_train_real=lm.load_numbers('../data/fm_train_real.dat') fm_test_real=lm.load_numbers('../data/fm_test_real.dat') linear()
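# The general idea behind average-diagonal normalization is to rescale the
# kernel matrix so that its diagonal entries average to a target value. The
# numpy sketch below shows that generic recipe; whether the 'scale' argument
# of AvgDiagKernelNormalizer plays exactly this role should be taken from the
# shogun documentation rather than from this illustration.
from numpy import dot, diag, mean
from numpy.random import rand

X=rand(3, 5)
K=dot(X.T, X)                     # raw linear kernel matrix
scale=1.2
K_norm=K*(scale/mean(diag(K)))    # diagonal now averages to 'scale'
print mean(diag(K_norm))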
# This is an example for the initialization of a linear kernel on string data. The # strings are all of the same length and consist of the characters 'ACGT' corresponding # to the DNA-alphabet. Each column of the matrices of type char corresponds to # one training/test example. def linear_string (): print 'LinearString' from shogun.Features import StringCharFeatures, DNA from shogun.Kernel import LinearStringKernel feats_train=StringCharFeatures(fm_train_dna, DNA) feats_test=StringCharFeatures(fm_test_dna, DNA) kernel=LinearStringKernel(feats_train, feats_train) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() if __name__=='__main__': from tools.load import LoadMatrix lm=LoadMatrix() fm_train_dna=lm.load_dna('../data/fm_train_dna.dat') fm_test_dna=lm.load_dna('../data/fm_test_dna.dat') linear_string()
# This is an example for the initialization of a linear kernel on word (2byte) # data. def linear_word (): print 'LinearWord' from shogun.Kernel import LinearWordKernel, AvgDiagKernelNormalizer from shogun.Features import WordFeatures feats_train=WordFeatures(fm_train_word) feats_test=WordFeatures(fm_test_word) scale=1.4 kernel=LinearWordKernel() kernel.set_normalizer(AvgDiagKernelNormalizer(scale)) kernel.init(feats_train, feats_train) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() if __name__=='__main__': from tools.load import LoadMatrix from numpy import ushort lm=LoadMatrix() fm_train_word=ushort(lm.load_numbers('../data/fm_test_word.dat')) fm_test_word=ushort(lm.load_numbers('../data/fm_test_word.dat')) linear_word()
# This is an example for the initialization of the local alignment kernel on # DNA sequences, where each column of the matrices of type char corresponds to # one training/test example. def local_alignment_string(): print 'LocalAlignmentString' from shogun.Features import StringCharFeatures, DNA from shogun.Kernel import LocalAlignmentStringKernel feats_train=StringCharFeatures(fm_train_dna, DNA) feats_test=StringCharFeatures(fm_test_dna, DNA) kernel=LocalAlignmentStringKernel(feats_train, feats_train) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() if __name__=='__main__': from tools.load import LoadMatrix lm=LoadMatrix() fm_train_dna=lm.load_dna('../data/fm_train_dna.dat') fm_test_dna=lm.load_dna('../data/fm_test_dna.dat') local_alignment_string()
# The LocalityImprovedString kernel is inspired by the polynomial kernel.
# By comparing neighboring characters it puts emphasis on local features.
#
# It can be defined as
# K({\bf x},{\bf x'})=\left(\sum_{i=0}^{T-1}\left(\sum_{j=-l}^{+l}w_jI_{i+j}({\bf x},{\bf x'})\right)^{d_1}\right)^{d_2},
# where
# I_i({\bf x},{\bf x'})=1 if $x_i=x'_i$ and 0 otherwise.
#
def locality_improved_string ():
    print 'LocalityImprovedString'
    from shogun.Features import StringCharFeatures, DNA
    from shogun.Kernel import LocalityImprovedStringKernel

    feats_train=StringCharFeatures(fm_train_dna, DNA)
    feats_test=StringCharFeatures(fm_test_dna, DNA)

    length=5
    inner_degree=5
    outer_degree=7

    kernel=LocalityImprovedStringKernel(
        feats_train, feats_train, length, inner_degree, outer_degree)
    km_train=kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    km_test=kernel.get_kernel_matrix()

if __name__=='__main__':
    from tools.load import LoadMatrix
    lm=LoadMatrix()
    fm_train_dna=lm.load_dna('../data/fm_train_dna.dat')
    fm_test_dna=lm.load_dna('../data/fm_test_dna.dat')
    locality_improved_string()
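# The formula above translates almost literally into code. A pure-Python
# sketch for two equal-length strings with uniform window weights w_j = 1
# (shogun's actual weighting may differ; windows are clipped at the string
# boundaries here):
def locality_improved_kernel(x, y, l, d1, d2):
    # K(x,y) = ( sum_i ( sum_{j=-l..l} w_j * I_{i+j}(x,y) )^d1 )^d2
    T=len(x)
    total=0.0
    for i in xrange(T):
        inner=0.0
        for j in xrange(-l, l+1):
            if 0 <= i+j < T and x[i+j]==y[i+j]:
                inner+=1.0
        total+=inner**d1
    return total**d2

print locality_improved_kernel('ACGTACGT', 'ACGTCCGT', 2, 5, 7)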
# This example initializes the MatchWordString kernel on word features
# obtained from DNA strings; the kernel is rescaled with the
# AvgDiagKernelNormalizer using scaling factor 1.4.
def match_word_string ():
    print 'MatchWordString'
    from shogun.Kernel import MatchWordStringKernel, AvgDiagKernelNormalizer
    from shogun.Features import StringWordFeatures, StringCharFeatures, DNA

    degree=3
    scale=1.4
    size_cache=10

    order=3
    gap=0
    reverse=False

    charfeat=StringCharFeatures(fm_train_dna, DNA)
    feats_train=StringWordFeatures(DNA)
    feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)

    charfeat=StringCharFeatures(fm_test_dna, DNA)
    feats_test=StringWordFeatures(DNA)
    feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)

    kernel=MatchWordStringKernel(size_cache, degree)
    kernel.set_normalizer(AvgDiagKernelNormalizer(scale))
    kernel.init(feats_train, feats_train)

    km_train=kernel.get_kernel_matrix()
    kernel.init(feats_train, feats_test)
    km_test=kernel.get_kernel_matrix()

if __name__=='__main__':
    from tools.load import LoadMatrix
    lm=LoadMatrix()
    fm_train_dna=lm.load_dna('../data/fm_train_dna.dat')
    fm_test_dna=lm.load_dna('../data/fm_test_dna.dat')
    match_word_string()
# This is an example initializing the oligo string kernel, which takes
# distances between matching oligos (k-mers) into account via a Gaussian.
# Variable 'k' defines the length of the oligo and variable 'w' the width of
# the Gaussian. The oligo string kernel is implemented for the DNA-alphabet
# 'ACGT'.
#
def oligo_string ():
    print 'OligoString'
    from shogun.Features import StringCharFeatures, DNA
    from shogun.Kernel import OligoStringKernel

    feats_train=StringCharFeatures(fm_train_dna, DNA)
    feats_test=StringCharFeatures(fm_test_dna, DNA)

    k=3
    width=1.2
    size_cache=10

    kernel=OligoStringKernel(size_cache, k, width)
    kernel.init(feats_train, feats_train)

    km_train=kernel.get_kernel_matrix()
    kernel.init(feats_train, feats_test)
    km_test=kernel.get_kernel_matrix()

if __name__=='__main__':
    from tools.load import LoadMatrix
    lm=LoadMatrix()
    fm_train_dna=lm.load_dna('../data/fm_train_dna.dat')
    fm_test_dna=lm.load_dna('../data/fm_test_dna.dat')
    oligo_string()
# This is an example for the initialization of the PolyMatchString kernel on
# string data. The PolyMatchString kernel sums over the matches of two strings
# of the same length and takes the sum to the power of 'degree'. The strings
# consist of the characters 'ACGT' corresponding to the DNA-alphabet. Each
# column of the matrices of type char corresponds to one training/test example.
def poly_match_string ():
    print 'PolyMatchString'
    from shogun.Kernel import PolyMatchStringKernel
    from shogun.Features import StringCharFeatures, DNA

    feats_train=StringCharFeatures(fm_train_dna, DNA)
    feats_test=StringCharFeatures(fm_test_dna, DNA)
    degree=3
    inhomogene=False

    kernel=PolyMatchStringKernel(feats_train, feats_train, degree, inhomogene)
    km_train=kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    km_test=kernel.get_kernel_matrix()

if __name__=='__main__':
    from tools.load import LoadMatrix
    lm=LoadMatrix()
    fm_train_dna=lm.load_dna('../data/fm_train_dna.dat')
    fm_test_dna=lm.load_dna('../data/fm_test_dna.dat')
    poly_match_string()
# This is an example for the initialization of the PolyMatchWordString kernel
# on word features. The kernel sums over the matches of two strings of the
# same length and takes the sum to the power of 'degree'. The strings consist
# of the characters 'ACGT' corresponding to the DNA-alphabet and are converted
# into word features of order 3. Each column of the matrices of type char
# corresponds to one training/test example.
def poly_match_word_string ():
    print 'PolyMatchWordString'
    from shogun.Kernel import PolyMatchWordStringKernel
    from shogun.Features import StringWordFeatures, StringCharFeatures, DNA

    degree=2
    inhomogene=True

    order=3
    gap=0
    reverse=False

    charfeat=StringCharFeatures(fm_train_dna, DNA)
    feats_train=StringWordFeatures(DNA)
    feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)

    charfeat=StringCharFeatures(fm_test_dna, DNA)
    feats_test=StringWordFeatures(DNA)
    feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)

    kernel=PolyMatchWordStringKernel(feats_train, feats_train, degree, inhomogene)
    km_train=kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    km_test=kernel.get_kernel_matrix()

if __name__=='__main__':
    from tools.load import LoadMatrix
    lm=LoadMatrix()
    fm_train_dna=lm.load_dna('../data/fm_train_dna.dat')
    fm_test_dna=lm.load_dna('../data/fm_test_dna.dat')
    poly_match_word_string()
# This example initializes the polynomial kernel with real data.
# If variable 'inhomogene' is 'True' +1 is added to the scalar product
# before taking it to the power of 'degree'. If 'use_normalization' is
# set to 'True' then the kernel matrix will be normalized by the square
# roots of the diagonal entries.
def poly ():
    print 'Poly'
    from shogun.Features import RealFeatures
    from shogun.Kernel import PolyKernel

    feats_train=RealFeatures(fm_train_real)
    feats_test=RealFeatures(fm_test_real)
    degree=4
    inhomogene=False
    use_normalization=True

    kernel=PolyKernel(
        feats_train, feats_train, degree, inhomogene, use_normalization)
    km_train=kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    km_test=kernel.get_kernel_matrix()

if __name__=='__main__':
    from tools.load import LoadMatrix
    lm=LoadMatrix()
    fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
    poly()
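# The unnormalized polynomial kernel entry is easy to verify by hand:
# k(x,y) = (x.y + c)^degree with c = 1 in the inhomogeneous case and c = 0
# otherwise; the normalization described above divides k(x,y) by
# sqrt(k(x,x) * k(y,y)). A small numpy sketch (made-up helper name):
from numpy import dot, sqrt
from numpy.random import rand

def poly_kernel(x, y, degree, inhomogene):
    c=1.0 if inhomogene else 0.0
    return (dot(x, y)+c)**degree

x=rand(4)
y=rand(4)
k=poly_kernel(x, y, 4, False)
k_norm=k/sqrt(poly_kernel(x, x, 4, False)*poly_kernel(y, y, 4, False))
print k, k_norm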
# The SalzbergWordString kernel implements the Salzberg kernel.
#
# It is described in
#
# Engineering Support Vector Machine Kernels That Recognize Translation Initiation Sites
# A. Zien, G. Raetsch, S. Mika, B. Schoelkopf, T. Lengauer, K.-R. Mueller
#
def plugin_estimate_salzberg ():
    print 'PluginEstimate w/ SalzbergWord'
    from shogun.Features import StringCharFeatures, StringWordFeatures, DNA, Labels
    from shogun.Kernel import SalzbergWordStringKernel
    from shogun.Classifier import PluginEstimate

    order=3
    gap=0
    reverse=False

    charfeat=StringCharFeatures(fm_train_dna, DNA)
    feats_train=StringWordFeatures(charfeat.get_alphabet())
    feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)

    charfeat=StringCharFeatures(fm_test_dna, DNA)
    feats_test=StringWordFeatures(charfeat.get_alphabet())
    feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)

    pie=PluginEstimate()
    labels=Labels(label_train_dna)
    pie.set_labels(labels)
    pie.set_features(feats_train)
    pie.train()

    kernel=SalzbergWordStringKernel(feats_train, feats_train, pie, labels)
    km_train=kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    pie.set_features(feats_test)
    pie.classify().get_labels()
    km_test=kernel.get_kernel_matrix()

if __name__=='__main__':
    from tools.load import LoadMatrix
    lm=LoadMatrix()
    fm_train_dna=lm.load_dna('../data/fm_train_dna.dat')
    fm_test_dna=lm.load_dna('../data/fm_test_dna.dat')
    label_train_dna=lm.load_labels('../data/label_train_dna.dat')
    plugin_estimate_salzberg()
# The standard Sigmoid kernel computed on dense real valued features. def sigmoid (): print 'Sigmoid' from shogun.Features import RealFeatures from shogun.Kernel import SigmoidKernel feats_train=RealFeatures(fm_train_real) feats_test=RealFeatures(fm_test_real) size_cache=10 gamma=1.2 coef0=1.3 kernel=SigmoidKernel(feats_train, feats_train, size_cache, gamma, coef0) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() if __name__=='__main__': from tools.load import LoadMatrix lm=LoadMatrix() fm_train_real=lm.load_numbers('../data/fm_train_real.dat') fm_test_real=lm.load_numbers('../data/fm_test_real.dat') sigmoid()
# The SimpleLocalityImprovedString kernel is a 'simplified' and better
# performing version of the LocalityImprovedString kernel.
def simple_locality_improved_string ():
    print 'SimpleLocalityImprovedString'
    from shogun.Features import StringCharFeatures, DNA
    from shogun.Kernel import SimpleLocalityImprovedStringKernel

    feats_train=StringCharFeatures(fm_train_dna, DNA)
    feats_test=StringCharFeatures(fm_test_dna, DNA)

    length=5
    inner_degree=5
    outer_degree=7

    kernel=SimpleLocalityImprovedStringKernel(
        feats_train, feats_train, length, inner_degree, outer_degree)
    km_train=kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    km_test=kernel.get_kernel_matrix()

if __name__=='__main__':
    from tools.load import LoadMatrix
    lm=LoadMatrix()
    fm_train_dna=lm.load_dna('../data/fm_train_dna.dat')
    fm_test_dna=lm.load_dna('../data/fm_test_dna.dat')
    simple_locality_improved_string()
# This example demonstrates how to use the Gaussian Kernel with sparse features. def sparse_gaussian (): print 'SparseGaussian' from shogun.Features import SparseRealFeatures from shogun.Kernel import SparseGaussianKernel feats_train=SparseRealFeatures(fm_train_real) feats_test=SparseRealFeatures(fm_test_real) width=1.1 kernel=SparseGaussianKernel(feats_train, feats_train, width) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() if __name__=='__main__': from tools.load import LoadMatrix lm=LoadMatrix() fm_train_real=lm.load_numbers('../data/fm_train_real.dat') fm_test_real=lm.load_numbers('../data/fm_test_real.dat') sparse_gaussian()
# This example demonstrates how to use the Linear Kernel with sparse features. def sparse_linear (): print 'SparseLinear' from shogun.Features import SparseRealFeatures from shogun.Kernel import SparseLinearKernel, AvgDiagKernelNormalizer feats_train=SparseRealFeatures(fm_train_real) feats_test=SparseRealFeatures(fm_test_real) scale=1.1 kernel=SparseLinearKernel() kernel.set_normalizer(AvgDiagKernelNormalizer(scale)) kernel.init(feats_train, feats_train) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() if __name__=='__main__': from tools.load import LoadMatrix lm=LoadMatrix() fm_train_real=lm.load_numbers('../data/fm_train_real.dat') fm_test_real=lm.load_numbers('../data/fm_test_real.dat') sparse_linear()
# This example shows how to use the polynomial kernel with sparse features. def sparse_poly (): print 'SparsePoly' from shogun.Features import SparseRealFeatures from shogun.Kernel import SparsePolyKernel feats_train=SparseRealFeatures(fm_train_real) feats_test=SparseRealFeatures(fm_test_real) size_cache=10 degree=3 inhomogene=True kernel=SparsePolyKernel(feats_train, feats_train, size_cache, degree, inhomogene) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() if __name__=='__main__': from tools.load import LoadMatrix lm=LoadMatrix() fm_train_real=lm.load_numbers('../data/fm_train_real.dat') fm_test_real=lm.load_numbers('../data/fm_test_real.dat') sparse_poly()
# The class TOPFeatures implements TOP kernel features obtained from # two Hidden Markov models. # # It was used in # # K. Tsuda, M. Kawanabe, G. Raetsch, S. Sonnenburg, and K.R. Mueller. A new # discriminative kernel from probabilistic models. Neural Computation, # 14:2397-2414, 2002. # # which also has the details. # # Note that TOP-features are computed on the fly, so to be effective feature # caching should be enabled. # # It inherits its functionality from CSimpleFeatures, which should be # consulted for further reference. # def top(): print "TOP Kernel" from shogun.Features import StringCharFeatures, StringWordFeatures, TOPFeatures, DNA from shogun.Kernel import PolyKernel from shogun.Distribution import HMM, BW_NORMAL N=1 # toy HMM with 1 state M=4 # 4 observations -> DNA pseudo=1e-1 order=1 gap=0 reverse=False kargs=[1, False, True] # train HMM for positive class charfeat=StringCharFeatures(fm_hmm_pos, DNA) hmm_pos_train=StringWordFeatures(charfeat.get_alphabet()) hmm_pos_train.obtain_from_char(charfeat, order-1, order, gap, reverse) pos=HMM(hmm_pos_train, N, M, pseudo) pos.baum_welch_viterbi_train(BW_NORMAL) # train HMM for negative class charfeat=StringCharFeatures(fm_hmm_neg, DNA) hmm_neg_train=StringWordFeatures(charfeat.get_alphabet()) hmm_neg_train.obtain_from_char(charfeat, order-1, order, gap, reverse) neg=HMM(hmm_neg_train, N, M, pseudo) neg.baum_welch_viterbi_train(BW_NORMAL) # Kernel training data charfeat=StringCharFeatures(fm_train_dna, DNA) wordfeats_train=StringWordFeatures(charfeat.get_alphabet()) wordfeats_train.obtain_from_char(charfeat, order-1, order, gap, reverse) # Kernel testing data charfeat=StringCharFeatures(fm_test_dna, DNA) wordfeats_test=StringWordFeatures(charfeat.get_alphabet()) wordfeats_test.obtain_from_char(charfeat, order-1, order, gap, reverse) # get kernel on training data pos.set_observations(wordfeats_train) neg.set_observations(wordfeats_train) feats_train=TOPFeatures(10, pos, neg, False, False) kernel=PolyKernel(feats_train, feats_train, *kargs) km_train=kernel.get_kernel_matrix() # get kernel on testing data pos_clone=HMM(pos) neg_clone=HMM(neg) pos_clone.set_observations(wordfeats_test) neg_clone.set_observations(wordfeats_test) feats_test=TOPFeatures(10, pos_clone, neg_clone, False, False) kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() if __name__=='__main__': from tools.load import LoadMatrix from numpy import where lm=LoadMatrix() fm_train_dna=lm.load_dna('../data/fm_train_dna.dat') fm_test_dna=lm.load_dna('../data/fm_test_dna.dat') label_train_dna=lm.load_labels('../data/label_train_dna.dat') fm_hmm_pos=[ fm_train_dna[i] for i in where([label_train_dna==1])[1] ] fm_hmm_neg=[ fm_train_dna[i] for i in where([label_train_dna==-1])[1] ] top()
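The first component of a TOP feature vector is the log-odds of the two generative models, log P(x|pos) - log P(x|neg); the remaining components are derivatives of both log-likelihoods w.r.t. the model parameters. A toy sketch of just the log-odds component, assuming order-0 models (independent per-symbol emissions, chosen here purely for illustration):

import numpy
probs_pos = {'A': 0.4, 'C': 0.2, 'G': 0.2, 'T': 0.2}     # hypothetical emissions
probs_neg = {'A': 0.25, 'C': 0.25, 'G': 0.25, 'T': 0.25}
def log_odds(seq):
    lp = sum(numpy.log(probs_pos[c]) for c in seq)
    ln = sum(numpy.log(probs_neg[c]) for c in seq)
    return lp - ln
# The PolyKernel above then operates on the resulting real-valued TOP vectors.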
# The WeightedCommWordString kernel may be used to compute the weighted # spectrum kernel (i.e. a spectrum kernel for 1 to K-mers, where each k-mer # length is weighted by some coefficient \f$\beta_k\f$) from strings that have # been mapped into unsigned 16bit integers. # # These 16bit integers correspond to k-mers. To be applicable in this kernel they # need to be sorted (e.g. via the SortWordString pre-processor). # # It basically uses the algorithm in the unix "comm" command (hence the name) # to compute: # # \f[ k({\bf x},{\bf x'})= \sum_{k=1}^K\beta_k\Phi_k({\bf x})\cdot \Phi_k({\bf x'}) \f] # # where \f$\Phi_k\f$ maps a sequence \f${\bf x}\f$ that consists of letters in # \f$\Sigma\f$ to a feature vector of size \f$|\Sigma|^k\f$. In this feature # vector each entry denotes how often the k-mer appears in that \f${\bf x}\f$. # # Note that this representation is especially tuned to small alphabets # (like the 2-bit alphabet DNA), for which it enables spectrum kernels # of order 8. # # For this kernel the linadd speedups are quite efficiently implemented using # direct maps. # def weighted_comm_word_string (): print 'WeightedCommWordString' from shogun.Kernel import WeightedCommWordStringKernel from shogun.Features import StringWordFeatures, StringCharFeatures, DNA from shogun.PreProc import SortWordString order=3 gap=0 reverse=True charfeat=StringCharFeatures(fm_train_dna, DNA) feats_train=StringWordFeatures(charfeat.get_alphabet()) feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse) preproc=SortWordString() preproc.init(feats_train) feats_train.add_preproc(preproc) feats_train.apply_preproc() charfeat=StringCharFeatures(fm_test_dna, DNA) feats_test=StringWordFeatures(charfeat.get_alphabet()) feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse) feats_test.add_preproc(preproc) feats_test.apply_preproc() use_sign=False kernel=WeightedCommWordStringKernel(feats_train, feats_train, use_sign) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() if __name__=='__main__': from tools.load import LoadMatrix lm=LoadMatrix() fm_train_dna=lm.load_dna('../data/fm_train_dna.dat') fm_test_dna=lm.load_dna('../data/fm_test_dna.dat') weighted_comm_word_string()
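A naive pure-Python sketch of the weighted spectrum kernel formula above, using plain strings and explicit k-mer counting (Shogun instead operates on sorted 16bit k-mer arrays for speed):

def kmer_counts(s, k):
    counts = {}
    for i in range(len(s) - k + 1):
        counts[s[i:i+k]] = counts.get(s[i:i+k], 0) + 1
    return counts

def weighted_spectrum(x, y, K, beta):
    # k(x, y) = sum_k beta_k * <Phi_k(x), Phi_k(y)>
    val = 0.0
    for k in range(1, K + 1):
        cx, cy = kmer_counts(x, k), kmer_counts(y, k)
        val += beta[k-1] * sum(cx[m] * cy.get(m, 0) for m in cx)
    return val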
# The Weighted Degree Position String kernel (Weighted Degree kernel with shifts). # # The WD-shift kernel of order d compares two sequences X and # Y of length L by summing all contributions of k-mer matches of # lengths k in 1...d, weighted by coefficients beta_k # allowing for a positional tolerance of up to shift s. # def weighted_degree_position_string (): print 'WeightedDegreePositionString' from shogun.Features import StringCharFeatures, DNA from shogun.Kernel import WeightedDegreePositionStringKernel feats_train=StringCharFeatures(fm_train_dna, DNA) feats_test=StringCharFeatures(fm_test_dna, DNA) degree=20 kernel=WeightedDegreePositionStringKernel(feats_train, feats_train, degree) #kernel.set_shifts(zeros(len(data['train'][0]), dtype=int)) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() if __name__=='__main__': from tools.load import LoadMatrix lm=LoadMatrix() fm_train_dna=lm.load_dna('../data/fm_train_dna.dat') fm_test_dna=lm.load_dna('../data/fm_test_dna.dat') weighted_degree_position_string()
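For reference, a direct (and slow) sketch of the plain weighted degree kernel with the commonly used weights beta_k = 2*(d-k+1)/(d*(d+1)); the shift variant above additionally rewards k-mer matches displaced by up to s positions:

def wd_kernel(x, y, d):
    assert len(x) == len(y)
    val = 0.0
    for k in range(1, d + 1):
        beta = 2.0 * (d - k + 1) / (d * (d + 1))
        for i in range(len(x) - k + 1):
            if x[i:i+k] == y[i:i+k]:
                val += beta
    return val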
# This example shows how to create a Weighted Degree String Kernel from data # and how to compute the kernel matrix from the resulting object. def weighted_degree_string (): print 'WeightedDegreeString' from shogun.Features import StringCharFeatures, DNA from shogun.Kernel import WeightedDegreeStringKernel feats_train=StringCharFeatures(fm_train_dna, DNA) feats_test=StringCharFeatures(fm_test_dna, DNA) degree=20 kernel=WeightedDegreeStringKernel(feats_train, feats_train, degree) #weights=arange(1,degree+1,dtype=double)[::-1]/ \ # sum(arange(1,degree+1,dtype=double)) #kernel.set_wd_weights(weights) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() if __name__=='__main__': from tools.load import LoadMatrix lm=LoadMatrix() fm_train_dna=lm.load_dna('../data/fm_train_dna.dat') fm_test_dna=lm.load_dna('../data/fm_test_dna.dat') weighted_degree_string()
# In this example we show how to perform Multiple Kernel Learning (MKL) # with the modular interface. First, we create a number of base kernels. # These kernels can capture different views of the same features, or actually # consider entirely different features associated with the same example # (e.g. DNA sequences = strings AND gene expression data = real values of the same tissue sample). # The base kernels are then subsequently added to a CombinedKernel, which # contains a weight for each kernel and encapsulates the base kernels # from the training procedure. When the CombinedKernel between two examples is # evaluated it computes the corresponding linear combination of kernels according to their weights. # We then show how to create an MKLClassifier that trains an SVM and learns the optimal # weighting of kernels (w.r.t. a given norm q) at the same time. # Finally, the example shows how to classify with a trained MKLClassifier. # from shogun.Features import CombinedFeatures, RealFeatures, Labels from shogun.Kernel import CombinedKernel, PolyKernel, CustomKernel from shogun.Classifier import MKLClassification def combined_custom(): ################################## # set up and train # create some poly train/test matrix tfeats = RealFeatures(fm_train_real) tkernel = PolyKernel(10,3) tkernel.init(tfeats, tfeats) K_train = tkernel.get_kernel_matrix() pfeats = RealFeatures(fm_test_real) tkernel.init(tfeats, pfeats) K_test = tkernel.get_kernel_matrix() # create combined train features feats_train = CombinedFeatures() feats_train.append_feature_obj(RealFeatures(fm_train_real)) # and corresponding combined kernel kernel = CombinedKernel() kernel.append_kernel(CustomKernel(K_train)) kernel.append_kernel(PolyKernel(10,2)) kernel.init(feats_train, feats_train) # train mkl labels = Labels(fm_label_twoclass) mkl = MKLClassification() # which norm to use for MKL mkl.set_mkl_norm(1) #2,3 # set cost (neg, pos) mkl.set_C(1, 1) # set kernel and labels mkl.set_kernel(kernel) mkl.set_labels(labels) # train mkl.train() #w=kernel.get_subkernel_weights() #kernel.set_subkernel_weights(w) ################################## # test # create combined test features feats_pred = CombinedFeatures() feats_pred.append_feature_obj(RealFeatures(fm_test_real)) # and corresponding combined kernel kernel = CombinedKernel() kernel.append_kernel(CustomKernel(K_test)) kernel.append_kernel(PolyKernel(10, 2)) kernel.init(feats_train, feats_pred) # and classify mkl.set_kernel(kernel) mkl.classify() if __name__=='__main__': from tools.load import LoadMatrix lm = LoadMatrix() fm_train_real = lm.load_numbers('../data/fm_train_real.dat') fm_test_real = lm.load_numbers('../data/fm_test_real.dat') fm_label_twoclass = lm.load_labels('../data/label_train_twoclass.dat') fm_train_real.shape fm_test_real.shape combined_custom()
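What the CombinedKernel evaluates can be written out directly: a weighted sum of the subkernel matrices, where the weights are exactly what MKL optimizes. An illustrative NumPy sketch:

import numpy
def combined_kernel_matrix(kernel_matrices, weights):
    # K_combined = sum_i w_i * K_i
    return sum(w * K for w, K in zip(weights, kernel_matrices))
# e.g. combined_kernel_matrix([K_train, K_poly], weights=[0.7, 0.3])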
# In this example we show how to perform Multiple Kernel Learning (MKL) # with the modular interface for multi-class classification. # First, we create a number of base kernels and features. # These kernels can capture different views of the same features, or actually # consider entirely different features associated with the same example # (e.g. DNA sequences = strings AND gene expression data = real values of the same tissue sample). # The base kernels are then subsequently added to a CombinedKernel, which # contains a weight for each kernel and encapsulates the base kernels # from the training procedure. When the CombinedKernel between two examples is # evaluated it computes the corresponding linear combination of kernels according to their weights. # We then show how to create an MKLMultiClass classifier that trains an SVM and learns the optimal # weighting of kernels (w.r.t. a given norm q) at the same time. The main difference to the binary # classification version of MKL is that we can use more than two values as labels, when training # the classifier. # Finally, the example shows how to classify with a trained MKLMultiClass classifier. # from shogun.Features import CombinedFeatures, RealFeatures, Labels from shogun.Kernel import CombinedKernel, GaussianKernel, LinearKernel,PolyKernel from shogun.Classifier import MKLMultiClass def mkl_multiclass (): print 'mkl_multiclass' width = 1.2 C = 1.2 epsilon = 1e-5 num_threads = 1 kernel = CombinedKernel() feats_train = CombinedFeatures() feats_test = CombinedFeatures() subkfeats_train = RealFeatures(fm_train_real) subkfeats_test = RealFeatures(fm_test_real) subkernel = GaussianKernel(10, width) feats_train.append_feature_obj(subkfeats_train) feats_test.append_feature_obj(subkfeats_test) kernel.append_kernel(subkernel) subkfeats_train = RealFeatures(fm_train_real) subkfeats_test = RealFeatures(fm_test_real) subkernel = LinearKernel() feats_train.append_feature_obj(subkfeats_train) feats_test.append_feature_obj(subkfeats_test) kernel.append_kernel(subkernel) subkfeats_train = RealFeatures(fm_train_real) subkfeats_test = RealFeatures(fm_test_real) subkernel = PolyKernel(10,2) feats_train.append_feature_obj(subkfeats_train) feats_test.append_feature_obj(subkfeats_test) kernel.append_kernel(subkernel) kernel.init(feats_train, feats_train) labels = Labels(label_train_multiclass) mkl = MKLMultiClass(C, kernel, labels) mkl.set_epsilon(epsilon); mkl.parallel.set_num_threads(num_threads) mkl.set_mkl_epsilon(0.001) mkl.set_mkl_norm(1.5) mkl.train() kernel.init(feats_train, feats_test) out = mkl.classify().get_labels() print out if __name__ == '__main__': from tools.load import LoadMatrix lm = LoadMatrix() fm_train_real = lm.load_numbers('../data/fm_train_real.dat') fm_test_real = lm.load_numbers('../data/fm_test_real.dat') label_train_multiclass = lm.load_labels('../data/label_train_multiclass.dat') mkl_multiclass()
# In this example a kernel matrix is computed for a given real-valued data set. # The kernel used is the Chi2 kernel which operates on real-valued vectors. It # computes the chi-squared distance between sets of histograms. It is a very # useful distance in image recognition (used to detect objects). The preprocessor # LogPlusOne adds one to a dense real-valued vector and takes the logarithm of # each component of it. It is most useful in situations where the inputs are # counts: When one compares differences of small counts any difference may matter # a lot, while small differences in large counts don't. This is what this log # transformation controls for. def log_plus_one (): print 'LogPlusOne' from shogun.Kernel import Chi2Kernel from shogun.Features import RealFeatures from shogun.PreProc import LogPlusOne feats_train=RealFeatures(fm_train_real) feats_test=RealFeatures(fm_test_real) preproc=LogPlusOne() preproc.init(feats_train) feats_train.add_preproc(preproc) feats_train.apply_preproc() feats_test.add_preproc(preproc) feats_test.apply_preproc() width=1.4 size_cache=10 kernel=Chi2Kernel(feats_train, feats_train, width, size_cache) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() if __name__=='__main__': from tools.load import LoadMatrix lm=LoadMatrix() fm_train_real=lm.load_numbers('../data/fm_train_real.dat') fm_test_real=lm.load_numbers('../data/fm_test_real.dat') log_plus_one()
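The transformation itself is just log(1 + x) applied component-wise. A tiny NumPy illustration of how it compresses large counts while keeping small counts distinguishable:

import numpy
x = numpy.array([0., 1., 9., 99., 999.])
numpy.log(x + 1)  # approx. [0.0, 0.69, 2.30, 4.61, 6.91]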
# In this example a kernel matrix is computed for a given real-valued data set. # The kernel used is the Chi2 kernel which operates on real-valued vectors. It # computes the chi-squared distance between sets of histograms. It is a very # useful distance in image recognition (used to detect objects). The preprocessor # NormOne normalizes vectors to have norm 1. def norm_one (): print 'NormOne' from shogun.Kernel import Chi2Kernel from shogun.Features import RealFeatures from shogun.PreProc import NormOne feats_train=RealFeatures(fm_train_real) feats_test=RealFeatures(fm_test_real) preproc=NormOne() preproc.init(feats_train) feats_train.add_preproc(preproc) feats_train.apply_preproc() feats_test.add_preproc(preproc) feats_test.apply_preproc() width=1.4 size_cache=10 kernel=Chi2Kernel(feats_train, feats_train, width, size_cache) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() if __name__=='__main__': from tools.load import LoadMatrix lm=LoadMatrix() fm_train_real=lm.load_numbers('../data/fm_train_real.dat') fm_test_real=lm.load_numbers('../data/fm_test_real.dat') norm_one()
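In plain NumPy the NormOne preprocessor amounts to scaling every feature vector (column) to unit Euclidean length. A minimal sketch:

import numpy
def norm_one(X):
    # X has shape (dims, num_examples); each column becomes a unit vector
    return X / numpy.sqrt((X ** 2).sum(axis=0))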
# In this example a kernel matrix is computed for a given real-valued data set. # The kernel used is the Chi2 kernel which operates on real-valued vectors. It # computes the chi-squared distance between sets of histograms. It is a very # useful distance in image recognition (used to detect objects). The preprocessor # PruneVarSubMean subtracts the mean from each feature and removes features that # have zero variance. def prune_var_sub_mean (): print 'PruneVarSubMean' from shogun.Kernel import Chi2Kernel from shogun.Features import RealFeatures from shogun.PreProc import PruneVarSubMean feats_train=RealFeatures(fm_train_real) feats_test=RealFeatures(fm_test_real) preproc=PruneVarSubMean() preproc.init(feats_train) feats_train.add_preproc(preproc) feats_train.apply_preproc() feats_test.add_preproc(preproc) feats_test.apply_preproc() width=1.4 size_cache=10 kernel=Chi2Kernel(feats_train, feats_train, width, size_cache) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() if __name__=='__main__': from tools.load import LoadMatrix lm=LoadMatrix() fm_train_real=lm.load_numbers('../data/fm_train_real.dat') fm_test_real=lm.load_numbers('../data/fm_test_real.dat') prune_var_sub_mean()
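A minimal NumPy sketch of what PruneVarSubMean does, assuming examples are stored column-wise as above: center each feature (row) and drop the rows whose variance is zero:

import numpy
def prune_var_sub_mean(X):
    centered = X - X.mean(axis=1).reshape(-1, 1)
    return centered[centered.var(axis=1) > 0]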
# In this example a kernel matrix is computed for a given string data set. The # CommUlongString kernel is used to compute the spectrum kernel from strings that # have been mapped into unsigned 64bit integers. These 64bit integers correspond # to k-mers. To be applicable in this kernel the mapped k-mers have to be sorted. # This is done using the SortUlongString preprocessor, which sorts the individual # strings in ascending order. The kernel function basically uses the algorithm in # the unix "comm" command (hence the name). Note that this representation enables # spectrum kernels of order 8 for 8bit alphabets (like binaries) and order 32 for # 2-bit alphabets like DNA. For this kernel the linadd speedups are implemented # (though there is room for improvement here when a whole set of sequences is # ADDed) using sorted lists. def sort_ulong_string (): print 'CommUlongString' from shogun.Kernel import CommUlongStringKernel from shogun.Features import StringCharFeatures, StringUlongFeatures, DNA from shogun.PreProc import SortUlongString order=3 gap=0 reverse=False charfeat=StringCharFeatures(DNA) charfeat.set_features(fm_train_dna) feats_train=StringUlongFeatures(charfeat.get_alphabet()) feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse) charfeat=StringCharFeatures(DNA) charfeat.set_features(fm_test_dna) feats_test=StringUlongFeatures(charfeat.get_alphabet()) feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse) preproc=SortUlongString() preproc.init(feats_train) feats_train.add_preproc(preproc) feats_train.apply_preproc() feats_test.add_preproc(preproc) feats_test.apply_preproc() use_sign=False kernel=CommUlongStringKernel(feats_train, feats_train, use_sign) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() if __name__=='__main__': from tools.load import LoadMatrix lm=LoadMatrix() fm_train_dna=lm.load_dna('../data/fm_train_dna.dat') fm_test_dna=lm.load_dna('../data/fm_test_dna.dat') sort_ulong_string()
# In this example a kernel matrix is computed for a given string data set. The # CommWordString kernel is used to compute the spectrum kernel from strings that # have been mapped into unsigned 16bit integers. These 16bit integers correspond # to k-mers. To be applicable in this kernel the mapped k-mers have to be sorted. # This is done using the SortWordString preprocessor, which sorts the individual # strings in ascending order. The kernel function basically uses the algorithm in # the unix "comm" command (hence the name). Note that this representation is # especially tuned to small alphabets (like the 2-bit alphabet DNA), for which it # enables spectrum kernels of order up to 8. For this kernel the linadd speedups # are quite efficiently implemented using direct maps. def sort_word_string (): print 'CommWordString' from shogun.Kernel import CommWordStringKernel from shogun.Features import StringCharFeatures, StringWordFeatures, DNA from shogun.PreProc import SortWordString order=3 gap=0 reverse=False charfeat=StringCharFeatures(fm_train_dna, DNA) feats_train=StringWordFeatures(charfeat.get_alphabet()) feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse) preproc=SortWordString() preproc.init(feats_train) feats_train.add_preproc(preproc) feats_train.apply_preproc() charfeat=StringCharFeatures(fm_test_dna, DNA) feats_test=StringWordFeatures(charfeat.get_alphabet()) feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse) feats_test.add_preproc(preproc) feats_test.apply_preproc() use_sign=False kernel=CommWordStringKernel(feats_train, feats_train, use_sign) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() if __name__=='__main__': from tools.load import LoadMatrix lm=LoadMatrix() fm_train_dna=lm.load_dna('../data/fm_train_dna.dat') fm_test_dna=lm.load_dna('../data/fm_test_dna.dat') sort_word_string()
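The "comm"-style trick mentioned above can be sketched in pure Python: with both k-mer lists sorted, a single merge pass multiplies the run lengths of equal k-mers, which equals the dot product of the implicit count vectors:

def comm_dot(a, b):  # a, b: sorted lists of k-mer codes
    i = j = 0
    val = 0
    while i < len(a) and j < len(b):
        if a[i] < b[j]:
            i += 1
        elif a[i] > b[j]:
            j += 1
        else:
            key = a[i]
            ca = cb = 0
            while i < len(a) and a[i] == key:
                i += 1
                ca += 1
            while j < len(b) and b[j] == key:
                j += 1
                cb += 1
            val += ca * cb
    return val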
# In this example a kernelized version of ridge regression (KRR) is trained on a # real-valued data set. The KRR is trained with regularization parameter tau=1e-6 # and a gaussian kernel with width=0.8. The labels of both the train and the test # data can be fetched via krr.classify().get_labels(). ########################################################################### # kernel ridge regression ########################################################################### def krr (): print 'KRR' from shogun.Features import Labels, RealFeatures from shogun.Kernel import GaussianKernel from shogun.Regression import KRR feats_train=RealFeatures(fm_train) feats_test=RealFeatures(fm_test) width=0.8 kernel=GaussianKernel(feats_train, feats_train, width) tau=1e-6 labels=Labels(label_train) krr=KRR(tau, kernel, labels) krr.train(feats_train) kernel.init(feats_train, feats_test) out = krr.classify().get_labels() return out # equivalent shorter version def krr_short (): print 'KRR_short' from shogun.Features import Labels, RealFeatures from shogun.Kernel import GaussianKernel from shogun.Regression import KRR width=0.8; tau=1e-6 krr=KRR(tau, GaussianKernel(0, width), Labels(label_train)) krr.train(RealFeatures(fm_train)) out = krr.classify(RealFeatures(fm_test)).get_labels() return out if __name__=='__main__': from numpy import array from numpy.random import seed, rand from tools.load import LoadMatrix lm=LoadMatrix() fm_train=lm.load_numbers('../data/fm_train_real.dat') fm_test=lm.load_numbers('../data/fm_test_real.dat') label_train=lm.load_labels('../data/label_train_twoclass.dat') out1=krr() out2=krr_short()
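Kernel ridge regression has the textbook closed-form solution alpha = (K + tau*I)^(-1) y, with test predictions K_test^T alpha. A NumPy sketch of that derivation (illustrative, not Shogun's actual solver):

import numpy
def krr_predict(K_train, y, K_test, tau=1e-6):
    # K_train: (n_train, n_train) kernel matrix, K_test: (n_train, n_test)
    alpha = numpy.linalg.solve(K_train + tau * numpy.eye(K_train.shape[0]), y)
    return numpy.dot(K_test.T, alpha)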
# In this example a support vector regression algorithm is trained on a # real-valued toy data set. The underlying library used for the SVR training is # LIBSVM. The SVR is trained with regularization parameter C=1 and a gaussian # kernel with width=2.1. The labels of both the train and the test data are # fetched via svr.classify().get_labels(). # # For more details on LIBSVM solver see http://www.csie.ntu.edu.tw/~cjlin/libsvm/ . def libsvr (): print 'LibSVR' from shogun.Features import Labels, RealFeatures from shogun.Kernel import GaussianKernel from shogun.Regression import LibSVR feats_train=RealFeatures(fm_train) feats_test=RealFeatures(fm_test) width=2.1 kernel=GaussianKernel(feats_train, feats_train, width) C=1 epsilon=1e-5 tube_epsilon=1e-2 labels=Labels(label_train) svr=LibSVR(C, epsilon, kernel, labels) svr.set_tube_epsilon(tube_epsilon) svr.train() kernel.init(feats_train, feats_test) out1=svr.classify().get_labels() out2=svr.classify(feats_test).get_labels() if __name__=='__main__': from numpy import array from numpy.random import seed, rand from tools.load import LoadMatrix lm=LoadMatrix() fm_train=lm.load_numbers('../data/fm_train_real.dat') fm_test=lm.load_numbers('../data/fm_test_real.dat') label_train=lm.load_labels('../data/label_train_twoclass.dat') libsvr()
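The tube_epsilon parameter above controls the epsilon-insensitive loss used by SVR: deviations smaller than the tube width are not penalized at all. As a one-line sketch:

def eps_insensitive_loss(y_true, y_pred, eps=1e-2):
    return max(0.0, abs(y_true - y_pred) - eps)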
# In this example a support vector regression algorithm is trained on a # real-valued toy data set. The underlying library used for the SVR training is # SVM^light. The SVR is trained with regularization parameter C=1 and a gaussian # kernel with width=2.1. The labels of both the train and the test data are # fetched via svr.classify().get_labels(). # # For more details on SVM^light see # T. Joachims. Making large-scale SVM learning practical. In Advances in Kernel # Methods -- Support Vector Learning, pages 169-184. MIT Press, Cambridge, MA USA, 1999. ########################################################################### # svm light based support vector regression ########################################################################### def svr_light (): print 'SVRLight' from shogun.Features import Labels, RealFeatures from shogun.Kernel import GaussianKernel try: from shogun.Regression import SVRLight except ImportError: print 'No support for SVRLight available.' return feats_train=RealFeatures(fm_train) feats_test=RealFeatures(fm_test) width=2.1 kernel=GaussianKernel(feats_train, feats_train, width) C=1 epsilon=1e-5 tube_epsilon=1e-2 num_threads=3 labels=Labels(label_train) svr=SVRLight(C, epsilon, kernel, labels) svr.set_tube_epsilon(tube_epsilon) svr.parallel.set_num_threads(num_threads) svr.train() kernel.init(feats_train, feats_test) svr.classify().get_labels() if __name__=='__main__': from numpy import array from numpy.random import seed, rand from tools.load import LoadMatrix lm=LoadMatrix() fm_train=lm.load_numbers('../data/fm_train_real.dat') fm_test=lm.load_numbers('../data/fm_test_real.dat') label_train=lm.load_labels('../data/label_train_twoclass.dat') svr_light()
# This example shows how to use boost serialization (only available if the compile flag was enabled) # to serialize/deserialize an SVMLight object. Note that this code is in alpha state. from shogun.Features import * from shogun.Library import MSG_DEBUG from shogun.Features import StringCharFeatures, Labels, DNA, Alphabet from shogun.Kernel import WeightedDegreeStringKernel, GaussianKernel from shogun.Classifier import SVMLight from numpy import * from numpy.random import randn import sys import types import random import bz2 import cPickle import inspect def save(filename, myobj): """ save object to file using pickle @param filename: name of destination file @type filename: str @param myobj: object to save (has to be pickleable) @type myobj: obj """ try: f = bz2.BZ2File(filename, 'wb') except IOError, details: sys.stderr.write('File ' + filename + ' cannot be written\n') sys.stderr.write(details) return cPickle.dump(myobj, f, protocol=2) f.close() def load(filename): """ Load from filename using pickle @param filename: name of file to load from @type filename: str """ try: f = bz2.BZ2File(filename, 'rb') except IOError, details: sys.stderr.write('File ' + filename + ' cannot be read\n') sys.stderr.write(details) return myobj = cPickle.load(f) f.close() return myobj ################################################## num=10 dist=1 width=2.1 traindata_real=concatenate((randn(2,num)-dist, randn(2,num)+dist), axis=1) testdata_real=concatenate((randn(2,num)-dist, randn(2,num)+dist), axis=1); trainlab=concatenate((-ones(num), ones(num))); testlab=concatenate((-ones(num), ones(num))); feats_train=RealFeatures(traindata_real); feats_test=RealFeatures(testdata_real); kernel=GaussianKernel(feats_train, feats_train, width); kernel.io.set_loglevel(MSG_DEBUG) labels=Labels(trainlab); svm=SVMLight(2, kernel, labels) svm.train() svm.io.set_loglevel(MSG_DEBUG) ################################################## print "labels:" print labels.to_string() print "features" print feats_train.to_string() print "kernel" print kernel.to_string() print "svm" print svm.to_string() print "#################################" fn = "serialized_svm.bz2" print "serializing SVM to file", fn save(fn, svm) print "#################################" print "unserializing SVM" svm2 = load(fn) print "#################################" print "comparing training" svm2.train() print "objective before serialization:", svm.get_objective() print "objective after serialization:", svm2.get_objective()
# In this example we use the dynamic programming implementation with a # gene-finding-specific model. The model and the training parameters # are stored in a file and are used to create a gene prediction on # some example sequence. #!/usr/bin/env python # -*- coding: utf-8 -*- from shogun.Structure import * import numpy from numpy import array,Inf,float64,matrix,frompyfunc,zeros #from IPython.Shell import IPShellEmbed #ipshell = IPShellEmbed() import gzip import scipy from scipy.io import loadmat import pickle try: from cStringIO import StringIO except ImportError: from StringIO import StringIO if scipy.__version__ >= '0.7.0': renametable = { 'scipy.io.mio5': 'scipy.io.matlab.mio5', 'scipy.sparse.sparse' : 'scipy.sparse', } else: renametable = {} def mapname(name): if name in renametable: return renametable[name] return name def mapped_load_global(self): module = mapname(self.readline()[:-1]) name = mapname(self.readline()[:-1]) klass = self.find_class(module, name) self.append(klass) def loads(str): file = StringIO(str) unpickler = pickle.Unpickler(file) unpickler.dispatch[pickle.GLOBAL] = mapped_load_global return unpickler.load() def run_test(): data_dict = loads(gzip.GzipFile('../data/DynProg_example_py.pickle.gz').read()) #data_dict = loadmat('../data/DynProg_example_py.dat.mat', appendmat=False, struct_as_record=False) #print data_dict #print len(data_dict['penalty_array'][0][0][0][0].limits[0]) num_plifs,num_limits = len(data_dict['penalty_array']),len(data_dict['penalty_array'][0].limits) pm = PlifMatrix() pm.create_plifs(num_plifs,num_limits) ids = numpy.array(range(num_plifs),dtype=numpy.int32) min_values = numpy.array(range(num_plifs),dtype=numpy.float64) max_values = numpy.array(range(num_plifs),dtype=numpy.float64) all_use_cache = numpy.array(range(num_plifs),dtype=numpy.bool) all_use_svm = numpy.array(range(num_plifs),dtype=numpy.int32) all_limits = zeros((num_plifs,num_limits)) all_penalties = zeros((num_plifs,num_limits)) all_names = ['']*num_plifs all_transforms = ['']*num_plifs for plif_idx in range(num_plifs): ids[plif_idx] = data_dict['penalty_array'][plif_idx].id-1 min_values[plif_idx] = data_dict['penalty_array'][plif_idx].min_value max_values[plif_idx] = data_dict['penalty_array'][plif_idx].max_value all_use_cache[plif_idx] = data_dict['penalty_array'][plif_idx].use_cache all_use_svm[plif_idx] = data_dict['penalty_array'][plif_idx].use_svm all_limits[plif_idx] = data_dict['penalty_array'][plif_idx].limits all_penalties[plif_idx] = data_dict['penalty_array'][plif_idx].penalties all_names[plif_idx] = str(data_dict['penalty_array'][plif_idx].name) all_transforms[plif_idx] = str(data_dict['penalty_array'][plif_idx].transform) if all_transforms[plif_idx] == '[]': all_transforms[plif_idx] = 'linear' pm.set_plif_ids(ids) pm.set_plif_min_values(min_values) pm.set_plif_max_values(max_values) pm.set_plif_use_cache(all_use_cache) pm.set_plif_use_svm(all_use_svm) pm.set_plif_limits(all_limits) pm.set_plif_penalties(all_penalties) #pm.set_plif_names(all_names) #pm.set_plif_transform_type(all_transforms) transition_ptrs = data_dict['model'].transition_pointers transition_ptrs = transition_ptrs[:,:,0:2] transition_ptrs = transition_ptrs.astype(numpy.float64) pm.compute_plif_matrix(transition_ptrs) # init_dyn_prog num_svms = 8 dyn = DynProg(num_svms) orf_info = data_dict['model'].orf_info orf_info = orf_info.astype(numpy.int32) num_states = orf_info.shape[0] dyn.set_num_states(num_states) block = data_dict['block'] seq_len = len(block.seq) seq = str(block.seq) gene_string = array([elem
for elem in seq]) # precompute_content_svms pos = block.all_pos-1 pos = pos.astype(numpy.int32) snd_pos = pos dyn.set_pos(pos) dyn.set_gene_string(gene_string) dyn.create_word_string() dyn.precompute_stop_codons() dyn.init_content_svm_value_array(num_svms) dict_weights = data_dict['content_weights'] dict_weights = dict_weights.reshape(8,1).astype(numpy.float64) dict_weights = zeros((8,5440)) dyn.set_dict_weights(dict_weights.T) dyn.precompute_content_values() dyn.init_mod_words_array(data_dict['model'].mod_words.astype(numpy.int32)) pm.compute_signal_plifs(data_dict['state_signals'].astype(numpy.int32)) dyn.set_orf_info(orf_info) # p = data_dict['model'].p q = data_dict['model'].q dyn.set_p_vector(p) dyn.set_q_vector(q) a_trans = data_dict['a_trans'] a_trans = a_trans.astype(float64) dyn.set_a_trans_matrix(a_trans) dyn.check_svm_arrays() features = data_dict['block'].features dyn.set_observation_matrix(features) dyn.set_content_type_array(data_dict['seg_path'].astype(numpy.float64)) dyn.best_path_set_segment_loss(data_dict['loss'].astype(numpy.float64)) use_orf = True feat_dims = [25,201,2] dyn.set_plif_matrices(pm); dyn.compute_nbest_paths(features.shape[2], use_orf, 1,True,False) # fetch results states = dyn.get_states() print states scores = dyn.get_scores() print scores positions = dyn.get_positions() print positions if __name__ == '__main__': run_test()
import gc from shogun.Features import Alphabet,StringCharFeatures,StringWordFeatures,DNA from shogun.PreProc import SortWordString, MSG_DEBUG from shogun.Kernel import CommWordStringKernel, IdentityKernelNormalizer from numpy import mat POS=[100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'TTGT', 100*'TTGT', 100*'TTGT',100*'TTGT', 100*'TTGT', 100*'TTGT',100*'TTGT', 100*'TTGT', 100*'TTGT',100*'TTGT', 100*'TTGT', 100*'TTGT',100*'TTGT', 100*'TTGT', 100*'TTGT',100*'TTGT', 100*'TTGT', 100*'TTGT',100*'TTGT', 100*'TTGT', 100*'TTGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT'] NEG=[100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'TTGT', 100*'TTGT', 100*'TTGT',100*'TTGT', 100*'TTGT', 100*'TTGT',100*'TTGT', 100*'TTGT', 100*'TTGT',100*'TTGT', 100*'TTGT', 100*'TTGT',100*'TTGT', 100*'TTGT', 100*'TTGT',100*'TTGT', 100*'TTGT', 100*'TTGT',100*'TTGT', 100*'TTGT', 100*'TTGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT'] order=7 gap=0 
reverse=False for i in xrange(10): alpha=Alphabet(DNA) traindat=StringCharFeatures(alpha) traindat.set_features(POS+NEG) trainudat=StringWordFeatures(traindat.get_alphabet()); trainudat.obtain_from_char(traindat, order-1, order, gap, reverse) #trainudat.io.set_loglevel(MSG_DEBUG) pre = SortWordString() #pre.io.set_loglevel(MSG_DEBUG) pre.init(trainudat) trainudat.add_preproc(pre) trainudat.apply_preproc() spec = CommWordStringKernel(10, False) spec.set_normalizer(IdentityKernelNormalizer()) spec.init(trainudat, trainudat) K=mat(spec.get_kernel_matrix()) del POS del NEG del order del gap del reverse