import string
import numpy as np
from math import log
from numpy import * 
from numpy.linalg import *
# load txt
def getfeature(s, i):
    """Return the item of ``s`` found at index (or key) ``i``."""
    feature_value = s[i]
    return feature_value
class Dtreenode():
    """One node of the regression tree.

    Attributes set by ``__init__``:
        data:    2-D array of the rows that reached this node; the last
                 column is the target value.
        error:   sum of squared deviations of the targets from their mean
                 (see ``culerror``).
        level:   depth value supplied by the caller.
        the:     split threshold, ``-1`` until a split is assigned.
        feature: split column index, ``-1`` until a split is assigned.
        Rnode, Lnode: empty ``Dtree`` objects that will hold the sub-trees.
    """

    def __init__(self, data, level):
        self.data = data
        self.error = self.culerror(self.data)
        self.level = level
        self.the = -1
        self.feature = -1
        self.Rnode = Dtree()
        self.Lnode = Dtree()

    def culerror(self, data):
        """Return the sum of squared errors of the last column of ``data``.

        The result is a one-element array (the column is sliced as
        ``data[:, -1:]`` so its 2-D shape is kept).  For data with zero rows
        the plain integer ``0`` is returned, which matches the module-level
        ``culerror`` function instead of dividing by zero.
        """
        y = data[:, -1:]
        if y.shape[0] == 0:
            # No rows means no spread; there is also no mean to divide for.
            return 0
        deviation = y - y.mean(axis=0)
        return (deviation ** 2).sum(axis=0)
    

class Dtree():
    """Regression tree grown by recursive binary splitting.

    ``root`` is the ``Dtreenode`` at the top of this (sub)tree.  The integer
    ``0`` is the sentinel meaning "no node has been built yet".
    """

    def __init__(self, root = 0):
        self.root = root

    def test(self, x):
        """Predict the target value for one feature vector ``x``.

        Rows whose split feature is greater than the threshold follow
        ``Lnode``; all others follow ``Rnode`` (the same rule ``createtree``
        uses when it partitions the training rows).
        """
        node = self.root

        # A node that was never split keeps feature == the == -1: it is a leaf.
        if node.the == -1 and node.feature == -1:
            # Predict the mean target of the leaf.  The node error is measured
            # around that mean, so returning the first row's label (as before)
            # was not the value the tree had been fitted to.
            return node.data[:, -1].mean()

        if x[node.feature] > node.the:
            child = node.Lnode
        else:
            child = node.Rnode

        # A child tree that was never built falls back to this node's average.
        if child.root == 0:
            return returnavr(node.data)
        return child.test(x)

    def training(self, train_path='1.txt', log_path='out.txt', max_error=100):
        """Build the tree from a whitespace-separated numeric text file.

        Args:
            train_path: file with one sample per line, target in the last
                column.
            log_path: file that receives one line per created node.
            max_error: a node whose squared error is at most this value is
                not split any further.
        """
        # ndmin=2 keeps a one-line file two-dimensional so column slicing works.
        data = np.loadtxt(train_path, dtype=float, ndmin=2)
        # The context manager closes the log even if tree building raises.
        with open(log_path, "w") as f:
            self.createtree(data, f, 0, max_error)

    def testing(self, test_path='1test.txt'):
        """Return the mean squared prediction error over a test file.

        The file has the same layout as the training file.  The result is a
        one-element array because the targets are sliced as a column.
        """
        data = np.loadtxt(test_path, dtype=float, ndmin=2)
        y = data[:, -1:]
        x = data[:, 0:-1]

        res = 0
        for features, target in zip(x, y):
            res += (self.test(features) - target) ** 2
        return res / len(x)

    def createtree(self, data, f, level, max_error=100):
        """Create the node for ``data`` and recursively grow its children.

        Args:
            data: 2-D array of samples, target in the last column.
            f: open, writable file object used as the build log.
            level: depth of the node being created (0 for the top).
            max_error: stop splitting once the node error is at most this.
        """
        treenode = Dtreenode(data, level)

        indent = "\t" * level
        # .item() extracts the single value explicitly; formatting a
        # one-element array with %f relies on implicit array-to-float
        # conversion.
        error_value = np.asarray(treenode.error).item()
        f.write(indent + "level: %3d error: %0.2f \n" % (level, error_value))
        # A parenthesised single argument prints the same on Python 2 and 3.
        print(indent + str(treenode.error))

        # Always install the new node, so building again replaces the old
        # tree instead of attaching children to a node nobody references.
        self.root = treenode

        if treenode.error <= max_error:
            return

        feature, the = bestsplit(data)
        dataL, dataR = splitdata(data, feature, the)
        if len(dataL) == 0 or len(dataR) == 0:
            # The split separates nothing (e.g. all features identical).
            # Keep this node as a leaf rather than recursing on the same rows
            # or building a node from no rows.
            return

        treenode.feature = feature
        treenode.the = the
        treenode.Lnode.createtree(dataL, f, level + 1, max_error)
        treenode.Rnode.createtree(dataR, f, level + 1, max_error)
        return
      
def returnavr(data):
    """Return the average of the last column of ``data``.

    The column is taken as ``data[:, -1:]``, so the result is a one-element
    array rather than a scalar.
    """
    targets = data[:, -1:]
    n_rows, _ = targets.shape
    total = 0
    for target in targets:
        total = total + target
    return total / n_rows
        
def culerror(data):
    """Return the sum of squared deviations of the last column from its mean.

    Returns the integer ``0`` when ``data`` has no rows; otherwise a
    one-element array (the column is sliced as ``data[:, -1:]``).
    """
    targets = data[:, -1:]
    n_rows, _ = targets.shape
    if n_rows == 0:
        return 0

    # First pass: mean of the targets.
    total = 0
    for target in targets:
        total = total + target
    mean = total / n_rows

    # Second pass: accumulate the squared distance of each target to the mean.
    squared_error = 0
    for target in targets:
        squared_error = squared_error + (mean - target) ** 2
    return squared_error
    
def splitdata(data, feature, the):
    """Partition the rows of ``data`` on one feature column.

    Args:
        data: 2-D array of samples.
        feature: index of the column to compare.
        the: threshold value.

    Returns:
        ``(dataL, dataR)``: ``dataL`` holds the rows whose ``feature`` value
        is greater than ``the``, ``dataR`` the remaining rows.  Both are new
        2-D float arrays with the same number of columns as ``data``, and
        either may have zero rows.
    """
    # The previous append-based version always produced float arrays; keep
    # that so callers see the same dtype.
    rows = np.asarray(data, dtype=float)
    # One boolean mask replaces the per-row append (which re-copied the whole
    # array each time) and the manual reshape whose "/" yields a float on
    # Python 3.
    goes_left = rows[:, feature] > the
    return rows[goes_left], rows[~goes_left]
    
def _sse(values):
    """Return the sum of squared deviations of ``values`` from their mean."""
    return ((values - values.mean()) ** 2).sum()


def bestsplit(data):
    """Find the (feature, threshold) pair with the smallest squared error.

    Every column except the last is treated as a feature; the last column is
    the target.  For each feature, every distinct value except the largest is
    tried as a threshold: rows with a value greater than the threshold form
    one group, the rest the other, and the split is scored by the sum of the
    two groups' squared errors.  On ties the first candidate found (lowest
    feature index, then lowest threshold) wins.

    Returns:
        ``(feature_index, threshold)``.  If no threshold separates the rows
        (no feature column, or every feature is constant) the result is
        feature 0 with that column's maximum, which sends every row to the
        "not greater" side so the caller can detect that nothing was split.
        For data with no rows or no feature column it is ``(0, 0)``.
    """
    row, col = data.shape
    targets = data[:, -1]

    best_feature = 0
    best_threshold = data[:, 0].max() if (row > 0 and col > 1) else 0
    min_error = float("inf")

    # The two-row case needs no shortcut: the general search below handles it.
    # The old shortcut used row 0's value of feature 0 as the threshold, which
    # left one side empty whenever row 0 held the larger value.
    for feature in range(col - 1):
        values = data[:, feature]
        # np.unique returns the distinct values in ascending order.  Dropping
        # the largest guarantees both groups are non-empty.
        for threshold in np.unique(values)[:-1]:
            above = values > threshold
            error = _sse(targets[above]) + _sse(targets[~above])
            if error < min_error:
                min_error = error
                best_feature = feature
                best_threshold = threshold
    return best_feature, best_threshold
        


# Script entry: fit the tree on the training file, then print the error that
# testing() reports for the test file.
T = Dtree()
T.training()
print(T.testing())
 


            
             

