'''
Created May 27, 2014

@author Rainicy
'''

import numpy as np


def initialData(data):
    '''
    Description: Splits the data into training and testing sets, and splits
                 the features from the labels. The last column is the label.
                 The training data order is also shuffled.
                 The data is partitioned following the instructions at:
                 http://www.ccs.neu.edu/home/jaa/CS6140.13F/Homeworks/hw.03.html

    @param:
        data: the whole dataset
    @return:
        trainX: training data features
        trainY: training data labels
        testX:  testing data features
        testY:  testing data labels
    '''
    # split into training and testing data:
    # rows whose index ends with the digit 1 form the testing set, the rest are training
    testIndex = np.arange(1, data.shape[0], 10)
    testData = data[testIndex]
    trainData = np.delete(data, testIndex, 0)

    # shuffle the training data
    np.random.shuffle(trainData)

    # get mean and std of the features over the whole dataset
    mean = np.mean(data[:, :-1], axis=0)
    std = np.std(data[:, :-1], axis=0)

    # z-score the features [(x - mean) / std]
    trainX = trainData[:, :-1]
    trainY = trainData[:, -1]
    trainX = (trainX - mean) / std
    testX = testData[:, :-1]
    testY = testData[:, -1]
    testX = (testX - mean) / std

    # prepend a column of 1s (bias term)
    trainX = np.insert(trainX, 0, 1, axis=1)
    testX = np.insert(testX, 0, 1, axis=1)

    return trainX, trainY, testX, testY


def RMSE(h, y):
    '''
    Description: Root Mean Squared Error (RMSE).
                 J = sum((h - y)^2)
                 RMSE = sqrt(J / m)   [m: #samples]
                 More info: http://en.wikipedia.org/wiki/Root_mean_square_deviation

    @param:
        h: hypothesis, calculated by (h = theta.T * X)
        y: the true labels
    @return:
        RMSE: root mean squared error
    '''
    h = np.squeeze(np.asarray(h))
    y = np.squeeze(np.asarray(y))
    J = np.sum((h - y)**2)
    return np.sqrt(J / y.size)
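

# A minimal usage sketch (not part of the original module): it builds a synthetic
# dataset and fits a closed-form least-squares model to show how initialData and
# RMSE fit together. The synthetic data and the normal-equation fit are
# illustrative assumptions, not the homework's actual dataset or solver.
if __name__ == '__main__':
    np.random.seed(0)

    # synthetic dataset: 100 samples, 3 features, last column is a noisy linear label
    features = np.random.rand(100, 3)
    labels = features.dot(np.array([2.0, -1.0, 0.5])) + 0.1 * np.random.randn(100)
    data = np.column_stack((features, labels))

    trainX, trainY, testX, testY = initialData(data)

    # least-squares fit via numpy's solver (illustrative only)
    theta, _, _, _ = np.linalg.lstsq(trainX, trainY, rcond=None)

    print('train RMSE:', RMSE(trainX.dot(theta), trainY))
    print('test  RMSE:', RMSE(testX.dot(theta), testY))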