# ==============================================================================
# -----------------
# Build the dataset
import numpy as N
import datamind.data as data

d=data.twoclassses(nbSamples=34,deltaMeans=N.arange(1,0,-.1),nbNoize=100)
Y=N.asarray(d[:,["class"]])
X=N.asarray(d[:,1:])

# -----------------
print "First univariate look at the data **********************************"
import datamind.ml.dimred as DR
univ=DR.UnivOneWayAnovaFstat()
print univ.summary(X,Y,plot=True)[:20]
import pylab as PL
PL.show()

# -----------------
# Configure feature selection & classifier

# Univariate feature selection based on F-statistic ranking; keep the 5 best features
import datamind.ml.dimred as DR
dimRed=DR.UnivOneWayAnovaFstat(dim=5)
# LDA classifier
import datamind.ml.classif as CLF
clf=CLF.LDA(priors=[.5,.5])
# Leave-One-Out CV
import datamind.ml.resampling as resample
cv=resample.LOO(Y)
# Loss function (zero/one accuracy)
import datamind.ml.func as F

print "************************************************************************"
print "LOO, keep only the five best ranked features ***************************"
results=[]
for trainIndexes,testIndexes in cv:
    # Split X,Y into train and test sets
    Xtrain,Xtest,Ytrain,Ytest=\
        resample.splitTrainTest(trainIndexes,testIndexes,X,Y)
    # Rank features on the train data only
    dimRed.fit(Xtrain,Ytrain)
    # Select the 5 best ranked features on train & test
    Xtrain_r=dimRed.reduce(Xtrain)
    Xtest_r=dimRed.reduce(Xtest)
    # Learn LDA on the selected features of the training data
    clf.fit(Xtrain_r,Ytrain)
    # Predict with LDA on the selected features of the test data
    Ypred=clf.predict(Xtest_r)
    # Compute the accuracy
    testAccuracy=F.zeroOne_acc(Ytest,Ypred)
    print "Selected features:",dimRed.getSelectedFeatures()
    results.append(testAccuracy)

print "Accuracy --------------"
print results
print "Average accuracy ------"
print N.mean(results)

print "************************************************************************"
print "LOO, same thing but with the dimension increasing from 1 to 5 **********"
# Do the classification for dimensions [1,2,3,4,5]
dimensions=[1,2,3,4,5]
# Re-initialize the CV iterator
cv=resample.LOO(Y)
res=[]
for trainIndexes,testIndexes in cv:
    # Split X,Y into train and test sets
    Xtrain,Xtest,Ytrain,Ytest=\
        resample.splitTrainTest(trainIndexes,testIndexes,X,Y)
    # Rank features on the train data only
    dimRed.fit(Xtrain,Ytrain)
    for dim in dimensions:
        dimRed.setParams(dim=dim)
        # Select the dim best ranked features on train & test
        Xtrain_r=dimRed.reduce(Xtrain)
        Xtest_r=dimRed.reduce(Xtest)
        # Learn LDA on the selected features of the training data
        clf.fit(Xtrain_r,Ytrain)
        # Predict with LDA on the selected features of the test data
        Ypred=clf.predict(Xtest_r)
        # Compute the test accuracy
        testAccuracy=F.zeroOne_acc(Ytest,Ypred)
        # Do the same on the train data
        trainAccuracy=F.zeroOne_acc(Ytrain,clf.predict(Xtrain_r))
        # Store one row per (fold,dim): fold index, dim, accuracies and the selected features
        res.append([testIndexes[0],dim,testAccuracy,trainAccuracy,
                    dimRed.getSelectedFeatures()])

from datamind.core import DF
d=DF(res,colnames=["fold","dim","testAccuracy","trainAccuracy","features"])
# d contains:
#fold dim testAccuracy trainAccuracy  features
#0.0  1.0 0.0          0.757575757576 [0]
#0.0  2.0 1.0          0.818181818182 [0 1]
#0.0  3.0 0.0          0.909090909091 [ 0 1 42]
#0.0  4.0 1.0          0.909090909091 [ 0 1 42 9]
#0.0  5.0 1.0          0.939393939394 [ 0 1 42 9 23]
#....
#33.0 1.0 1.0          0.787878787879 [1]
#33.0 2.0 1.0          0.818181818182 [1 0]
#33.0 3.0 1.0          0.939393939394 [ 1 0 23]
#33.0 4.0 1.0          0.939393939394 [ 1 0 23 42]
#33.0 5.0 1.0          0.939393939394 [ 1 0 23 42 2]

# Mean test/train accuracy by dimension
import datamind.ml.tools as tools
print tools.stats_array(d,fields=["testAccuracy","trainAccuracy"],by="dim")
# Looks like:
#dim testAccuracy   trainAccuracy
#1.0 0.588235294118 0.764705882353
#2.0 0.823529411765 0.823529411765
#3.0 0.823529411765 0.927807486631
#4.0 0.794117647059 0.940285204991
#5.0 0.823529411765 0.941176470588
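# ------------------------------------------------------------------
# Minimal cross-check sketch, using only numpy and pylab (both already
# imported above) and assuming `res` and `dimensions` as defined in this
# script: recompute the mean test/train accuracy per dimension directly
# from the raw `res` list and plot both curves. No datamind API beyond
# what is used above is assumed.
resArr=N.asarray([r[:4] for r in res],dtype=float)  # columns: fold, dim, testAccuracy, trainAccuracy
meanTest=[resArr[resArr[:,1]==dim,2].mean() for dim in dimensions]
meanTrain=[resArr[resArr[:,1]==dim,3].mean() for dim in dimensions]
PL.plot(dimensions,meanTest,"o-",label="test accuracy (LOO)")
PL.plot(dimensions,meanTrain,"s-",label="train accuracy")
PL.xlabel("number of selected features")
PL.ylabel("mean accuracy")
PL.legend(loc="lower right")
PL.show()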