import java.util.*;
import Jama.*;

/**
 * Helper routines for preparing a numeric dataset (rows = records,
 * columns = attributes) for linear regression: train/test splitting,
 * normalization, feature/label extraction, least-squares weight
 * computation, prediction and error measurement.
 *
 * All datasets are plain {@code double[][]} arrays; the column count is
 * taken from the first row, so rows are expected to have equal length.
 */
public class DataProcessor {

    /**
     * Splits the dataset into a training set and a test set. Rows with index
     * in {@code [testStartIDx, testEndIDx)} form the test set; all remaining
     * rows, in their original order, form the training set. Values are
     * copied, so the returned arrays do not share rows with {@code dataset}.
     *
     * @param dataset      source rows
     * @param testStartIDx index of the first test row (inclusive)
     * @param testEndIDx   index one past the last test row (exclusive)
     * @return a list holding the training set at index 0 and the test set
     *         at index 1
     * @throws IllegalArgumentException if the range is not within
     *         {@code 0 <= testStartIDx <= testEndIDx <= dataset.length}
     */
    public ArrayList<double[][]> splitTrainTest(double[][] dataset,
                                                int testStartIDx, int testEndIDx) {
        if (testStartIDx < 0 || testEndIDx < testStartIDx
                || testEndIDx > dataset.length) {
            throw new IllegalArgumentException(
                "Invalid test range [" + testStartIDx + ", " + testEndIDx
                + ") for dataset with " + dataset.length + " rows");
        }

        int colCount = (dataset.length == 0) ? 0 : dataset[0].length;
        int testRecordCount = testEndIDx - testStartIDx;
        int trainRecordCount = dataset.length - testRecordCount;
        double[][] testset = new double[testRecordCount][colCount];
        double[][] trainset = new double[trainRecordCount][colCount];

        // Test fold: the contiguous block starting at testStartIDx.
        setFoldData(testset, dataset, testStartIDx);
        // Training fold: everything before the test block ...
        copyRows(dataset, 0, trainset, 0, testStartIDx);
        // ... followed by everything after it.
        copyRows(dataset, testEndIDx, trainset, testStartIDx,
                 dataset.length - testEndIDx);

        ArrayList<double[][]> output = new ArrayList<double[][]>();
        output.add(trainset);
        output.add(testset);
        return output;
    }

    /**
     * Fills {@code fold} with the {@code fold.length} consecutive rows of
     * {@code dataset} that start at {@code startIDx}.
     */
    private void setFoldData(double[][] fold, double[][] dataset, int startIDx) {
        if (dataset.length == 0) {
            return;
        }
        copyRows(dataset, startIDx, fold, 0, fold.length);
    }

    /**
     * Copies {@code rowCount} rows from {@code src} (starting at
     * {@code srcStart}) into {@code dest} (starting at {@code destStart}).
     * The number of columns copied per row is the length of {@code src[0]}.
     */
    private void copyRows(double[][] src, int srcStart,
                          double[][] dest, int destStart, int rowCount) {
        if (rowCount <= 0 || src.length == 0) {
            return;
        }
        int colCount = src[0].length;
        for (int offset = 0; offset < rowCount; offset++) {
            System.arraycopy(src[srcStart + offset], 0,
                             dest[destStart + offset], 0, colCount);
        }
    }

    /**
     * Applies the given type of normalization to the listed columns of the
     * dataset, in place.
     *
     * Only type 0 (shift and scale) is implemented. Type 1 (zero mean, unit
     * variance) is a planned option that is not implemented yet; it and any
     * other value leave the dataset untouched.
     *
     * @param dataset           rows to normalize in place
     * @param numericIDXs       indices of the columns to normalize
     * @param normalizationType 0 for shift-and-scale normalization
     */
    public void normalizeNumericCols(double[][] dataset, int[] numericIDXs,
                                     int normalizationType) {
        if (normalizationType == 0) {
            shiftScaleNormalization(dataset, numericIDXs);
        }
    }

    /**
     * Shift-and-scale (min-max) normalization: every value of each listed
     * column is replaced by {@code (value - min) / (max - min)}, where min
     * and max are taken over that column. Columns whose values are all equal
     * (max == min) are left unchanged to avoid dividing by zero.
     */
    private void shiftScaleNormalization(double[][] dataset, int[] numericIDXs) {
        if (dataset.length > 0) {
            double[] minValues = new double[numericIDXs.length];
            double[] maxValues = new double[numericIDXs.length];

            // Initialize the extremes from the first row.
            for (int colIDx = 0; colIDx < numericIDXs.length; colIDx++) {
                double first = dataset[0][numericIDXs[colIDx]];
                minValues[colIDx] = first;
                maxValues[colIDx] = first;
            }

            // Scan the remaining rows to find per-column min and max.
            for (int rowIDx = 1; rowIDx < dataset.length; rowIDx++) {
                for (int colIDx = 0; colIDx < numericIDXs.length; colIDx++) {
                    double value = dataset[rowIDx][numericIDXs[colIDx]];
                    if (value < minValues[colIDx]) {
                        minValues[colIDx] = value;
                    }
                    if (value > maxValues[colIDx]) {
                        maxValues[colIDx] = value;
                    }
                }
            }

            // Rescale every row, including row 0, which was used only to
            // seed the extremes above.
            for (int rowIDx = 0; rowIDx < dataset.length; rowIDx++) {
                for (int colIDx = 0; colIDx < numericIDXs.length; colIDx++) {
                    double range = maxValues[colIDx] - minValues[colIDx];
                    if (range > 0) {
                        int target = numericIDXs[colIDx];
                        dataset[rowIDx][target] =
                            (dataset[rowIDx][target] - minValues[colIDx]) / range;
                    }
                }
            }
        }
        Log.write("Dataset normalized..");
    }

    /**
     * Intended to convert the numeric columns to nominal ones by a threshold
     * mechanism, updating the dataset in place.
     *
     * Not implemented yet: this method currently does nothing.
     */
    public void numericToNominal(double[][] dataset, int[] numericIDXs) {
    }

    /**
     * Builds the feature matrix for a dataset: all columns except the label
     * column, one matrix row per dataset row.
     *
     * @param dataset     source rows
     * @param labelColIDx index of the label column to leave out
     * @return the feature matrix
     * @throws IllegalArgumentException if the dataset has no rows
     */
    public Matrix getFeatureMatrix(double[][] dataset, int labelColIDx) {
        double[][] featureArray = get2DFeatureArray(dataset, labelColIDx);
        if (featureArray == null) {
            throw new IllegalArgumentException(
                "dataset must contain at least one row");
        }
        return new Matrix(featureArray);
    }

    /**
     * Builds the label vector for a dataset as a single-column matrix with
     * one row per dataset row.
     *
     * @param dataset     source rows
     * @param labelColIDx index of the label column
     * @return the labels as a column matrix
     * @throws IllegalArgumentException if the dataset has no rows
     */
    public Matrix getLabelVector(double[][] dataset, int labelColIDx) {
        double[] labelVector = getLabelArray(dataset, labelColIDx);
        if (labelVector == null) {
            throw new IllegalArgumentException(
                "dataset must contain at least one row");
        }
        return new Matrix(labelVector, labelVector.length);
    }

    /**
     * Computes the least-squares weight vector using the normal equation
     * {@code w = inverse(Xt . X) . (Xt . Y)}.
     *
     * @param X feature matrix
     * @param Y label column matrix
     * @return the weight vector as a matrix
     */
    public Matrix computeWeightVector(Matrix X, Matrix Y) {
        Matrix Xt = X.transpose();
        Matrix p1Inv = Xt.times(X).inverse();
        Matrix p2 = Xt.times(Y);
        return p1Inv.times(p2);
    }

    /**
     * Copies every column except the label column into a new array that has
     * one column fewer than the dataset.
     *
     * If {@code labelColIDx} does not refer to an existing column, the first
     * {@code columns - 1} columns are returned.
     *
     * @return the feature array, or {@code null} if the dataset has no rows
     */
    private double[][] get2DFeatureArray(double[][] dataset, int labelColIDx) {
        if (dataset.length == 0) {
            return null;
        }
        int colCount = dataset[0].length;
        int featureCount = Math.max(colCount - 1, 0);
        double[][] featureArray = new double[dataset.length][featureCount];
        for (int rowIDx = 0; rowIDx < dataset.length; rowIDx++) {
            int newColIDx = 0;
            // Visit every source column (the last one included); the second
            // bound keeps the write inside the narrower feature row.
            for (int colIDx = 0;
                 colIDx < colCount && newColIDx < featureCount;
                 colIDx++) {
                if (colIDx == labelColIDx) {
                    continue; // skip the label column
                }
                featureArray[rowIDx][newColIDx] = dataset[rowIDx][colIDx];
                newColIDx++;
            }
        }
        return featureArray;
    }

    /**
     * Extracts the label column as a flat array, one entry per dataset row.
     *
     * If {@code labelColIDx} does not refer to an existing column, every
     * entry of the result is 0.0.
     *
     * @param dataset     source rows
     * @param labelColIDx index of the label column
     * @return the labels, or {@code null} if the dataset has no rows
     */
    public double[] getLabelArray(double[][] dataset, int labelColIDx) {
        if (dataset.length == 0) {
            return null;
        }
        double[] labelVector = new double[dataset.length];
        boolean labelInRange =
            labelColIDx >= 0 && labelColIDx < dataset[0].length;
        if (labelInRange) {
            for (int rowIDx = 0; rowIDx < dataset.length; rowIDx++) {
                labelVector[rowIDx] = dataset[rowIDx][labelColIDx];
            }
        }
        return labelVector;
    }

    /**
     * Computes predictions as the matrix product
     * {@code featureMatrix . weightVector}.
     *
     * @return the product's backing array as returned by
     *         {@code Matrix.getArray()}
     */
    public double[][] computePrediction(Matrix featureMatrix, Matrix weightVector) {
        Matrix preds = featureMatrix.times(weightVector);
        return preds.getArray();
    }

    /**
     * Computes the mean squared error between the first column of
     * {@code predictions} and {@code labelArray}.
     *
     * @param predictions one row per record; only column 0 is read
     * @param labelArray  expected values; must have at least as many entries
     *                    as {@code predictions} has rows
     * @return the mean of the squared differences, or 0.0 when there are no
     *         predictions
     */
    public double computeMSE(double[][] predictions, double[] labelArray) {
        if (predictions.length == 0) {
            return 0.0; // no data, no error
        }
        double sumOfSquareDiff = 0.0;
        for (int rowCounter = 0; rowCounter < predictions.length; rowCounter++) {
            double diff = predictions[rowCounter][0] - labelArray[rowCounter];
            sumOfSquareDiff += diff * diff;
        }
        return sumOfSquareDiff / predictions.length;
    }

}

