# ==============================================================================
# -----------------
# Build the dataset
import numpy as N
import datamind.data as data

d=data.twoclassses(nbSamples=34,deltaMeans=N.arange(1,0,-.1),nbNoize=100)
Y=N.asarray(d[:,["class"]])
X=N.asarray(d[:,1:])

# -----------------
print "First univariate look at the data **********************************"
import datamind.ml.dimred as DR
univ=DR.UnivOneWayAnovaFstat()
print univ.summary(X,Y,plot=True)[:20]
import pylab as PL
PL.show()

# -----------------
# Configure feature selection & classifier

# Univariate feature selection based on F-statistic ranking; keep the 5 best features
import datamind.ml.dimred as DR
dimRed=DR.UnivOneWayAnovaFstat(dim=5)
# LDA classifier
import datamind.ml.classif as CLF
clf=CLF.LDA(priors=[.5,.5])
# Leave-One-Out CV
import datamind.ml.resampling as resample
cv=resample.LOO(Y)
# Loss function (zero/one accuracy)
import datamind.ml.func as F

print "************************************************************************"
print "LOO, keep only the five best ranked features ***************************"
results=[]
for trainIndexes,testIndexes in cv:
    # Split X,Y into train and test sets
    Xtrain,Xtest,Ytrain,Ytest=\
        resample.splitTrainTest(trainIndexes,testIndexes,X,Y)
    # Rank features on the train data only
    dimRed.fit(Xtrain,Ytrain)
    # Select the 5 best ranked features on train & test
    Xtrain_r=dimRed.reduce(Xtrain)
    Xtest_r=dimRed.reduce(Xtest)
    # Learn LDA on the selected features of the training data
    clf.fit(Xtrain_r,Ytrain)
    # Predict with LDA on the selected features of the test data
    Ypred=clf.predict(Xtest_r)
    # Compute the accuracy
    testAccuracy=F.zeroOne_acc(Ytest,Ypred)
    print "Selected features:",dimRed.getSelectedFeatures()
    results.append(testAccuracy)

print "Accuracy --------------"
print results
print "Average accuracy ------"
print N.mean(results)

print "************************************************************************"
print "LOO, same thing but with the dimension increasing from 1 to 5 **********"
# Do the classification for dimensions [1,2,3,4,5]
dimensions=[1,2,3,4,5]
# Re-initialize the CV iterator
cv=resample.LOO(Y)
res=[]
for trainIndexes,testIndexes in cv:
    # Split X,Y into train and test sets
    Xtrain,Xtest,Ytrain,Ytest=\
        resample.splitTrainTest(trainIndexes,testIndexes,X,Y)
    # Rank features on the train data only
    dimRed.fit(Xtrain,Ytrain)
    for dim in dimensions:
        dimRed.setParams(dim=dim)
        # Select the dim best ranked features on train & test
        Xtrain_r=dimRed.reduce(Xtrain)
        Xtest_r=dimRed.reduce(Xtest)
        # Learn LDA on the selected features of the training data
        clf.fit(Xtrain_r,Ytrain)
        # Predict with LDA on the selected features of the test data
        Ypred=clf.predict(Xtest_r)
        # Compute the test accuracy
        testAccuracy=F.zeroOne_acc(Ytest,Ypred)
        # Do the same on the train data
        trainAccuracy=F.zeroOne_acc(Ytrain,clf.predict(Xtrain_r))
        # Store one row per (fold,dim): fold index, dim, accuracies and the selected features
        res.append([testIndexes[0],dim,testAccuracy,trainAccuracy,
                    dimRed.getSelectedFeatures()])

from datamind.core import DF
d=DF(res,colnames=["fold","dim","testAccuracy","trainAccuracy","features"])
# d contains:
#fold dim testAccuracy trainAccuracy  features
#0.0  1.0 0.0          0.757575757576 [0]
#0.0  2.0 1.0          0.818181818182 [0 1]
#0.0  3.0 0.0          0.909090909091 [ 0 1 42]
#0.0  4.0 1.0          0.909090909091 [ 0 1 42 9]
#0.0  5.0 1.0          0.939393939394 [ 0 1 42 9 23]
#....
#33.0 1.0 1.0          0.787878787879 [1]
#33.0 2.0 1.0          0.818181818182 [1 0]
#33.0 3.0 1.0          0.939393939394 [ 1 0 23]
#33.0 4.0 1.0          0.939393939394 [ 1 0 23 42]
#33.0 5.0 1.0          0.939393939394 [ 1 0 23 42 2]

# Mean test/train accuracy by dimension
import datamind.ml.tools as tools
print tools.stats_array(d,fields=["testAccuracy","trainAccuracy"],by="dim")
# Looks like:
#dim testAccuracy   trainAccuracy
#1.0 0.588235294118 0.764705882353
#2.0 0.823529411765 0.823529411765
#3.0 0.823529411765 0.927807486631
#4.0 0.794117647059 0.940285204991
#5.0 0.823529411765 0.941176470588
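# ------------------------------------------------------------------
# Minimal cross-check sketch, using only numpy and pylab (both already
# imported above) and assuming `res` and `dimensions` as defined in this
# script: recompute the mean test/train accuracy per dimension directly
# from the raw `res` list and plot both curves. No datamind API beyond
# what is used above is assumed.
resArr=N.asarray([r[:4] for r in res],dtype=float)  # columns: fold, dim, testAccuracy, trainAccuracy
meanTest=[resArr[resArr[:,1]==dim,2].mean() for dim in dimensions]
meanTrain=[resArr[resArr[:,1]==dim,3].mean() for dim in dimensions]
PL.plot(dimensions,meanTest,"o-",label="test accuracy (LOO)")
PL.plot(dimensions,meanTrain,"s-",label="train accuracy")
PL.xlabel("number of selected features")
PL.ylabel("mean accuracy")
PL.legend(loc="lower right")
PL.show()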