'''
@author: Kechen Qin

Please check out our paper and the project website:
https://arxiv.org/abs/1705.05039
http://www.ccs.neu.edu/home/kechenqin/paper/acl2017.html

Please install the following packages:
lp_solve: http://lpsolve.sourceforge.net/5.5/Python.htm
numpy
sklearn

Please implement your own feature functions and input function.
'''
import numpy as np
import random
import copy
from sklearn import metrics
from lp_solve import *
from lp_maker import lp_maker
# Note: global_params (relation_tags, num_s_features, num_c_features) and the
# feature functions used below (get_s_feature_vector*, get_c_feature_vector*,
# levenshtein, reshape_vectors) are expected to be provided by your own code.


class Samplerank(object):
    '''
    SampleRank trainer for the discourse prediction task.
    '''

    def __init__(self, data, train_index, num_epochs, num_samples):
        '''
        Constructor
        '''
        self.data = data
        self.train_index = train_index
        self.num_epochs = num_epochs
        self.num_samples = num_samples
        self.early_stop_threshold = 0.01
        # samplerank_discourse_main() also expects self.X (encoded input
        # documents), self.gold_discourses (gold labels) and
        # self.log_linear_model (a Log_linear_model_inference instance) to be
        # set by your own input-loading code.

    def _initialize_theta(self, d):
        return [random.uniform(-1, 1) for i in range(d)]

    def _initialize_discourse(self, d):
        return [random.randint(0, len(global_params.relation_tags)-1) for i in range(d)]

    def sample_discourse(self, pre_s, cur_X):
        '''Sample a new discourse assignment based on the previous one.
        The proposal distribution is left to the user; the body below is an
        illustrative assumption (not the original implementation) that
        resamples the relation label of one randomly chosen edge.'''
        cur_s = list(pre_s)
        j = random.randint(0, len(cur_s)-1)
        cur_s[j] = random.randint(0, len(global_params.relation_tags)-1)
        return cur_s

    def get_regular_scoring(self, gold_standard, prediction, metric):
        if metric == 'acc':
            return metrics.accuracy_score(gold_standard, prediction)
        elif metric == 'f1':
            return np.mean(metrics.f1_score(gold_standard, prediction, average=None))
        elif metric == 'auc':
            return metrics.roc_auc_score(gold_standard, prediction)

    def get_learning_rate(self):
        return 0.01

    def metropolis_hastings(self, cur_feature, pre_feature, theta):
        '''The Metropolis-Hastings criterion is used to decide whether to accept the new sample.'''
        cur_l = self.log_linear_model.cal_scoring_function(theta, cur_feature)
        pre_l = self.log_linear_model.cal_scoring_function(theta, pre_feature)
        ratio = np.exp(cur_l - pre_l)  # equals exp(cur_l)/exp(pre_l), written to avoid overflow
        if ratio > 1:
            return True
        else:
            r = random.random()
            if r <= ratio:
                return True
            else:
                return False
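
    # Worked example of the acceptance test above (assumed numbers, not from
    # the paper): with theta = [1.0, -0.5], cur_feature = [1, 0] and
    # pre_feature = [0, 1], the ratio is exp(1.0 - (-0.5)) = exp(1.5) ~ 4.48 > 1,
    # so the proposed sample is always accepted; a ratio below 1 is accepted
    # with that probability.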

    def samplerank_discourse_main(self):
        '''This is an example of how the SampleRank algorithm works. For
        simplicity, I only show how to estimate parameters for the discourse
        prediction task.
        theta is the vector of model parameters that you want to estimate.
        get_s_feature_vector() is used to encode the input into a feature vector.'''
        pre_score = None
        pre_feature = None
        theta = self._initialize_theta('dimension of model parameters')  # pass the dimensionality of your feature vector
        for e in range(self.num_epochs):  # e is the index of the epoch
            print 'epoch:', e
            for i in range(len(self.X)):  # i is the index of the input document
                if i not in self.train_index:
                    continue
                print 'document:', i
                # per-document inputs: the i-th entry of each field plus the shared last two fields
                cur_data = [d[i] for d in self.data[:-2]] + list(self.data[-2:])
                cur_X = self.X[i]
                gold_discourse = self.gold_discourses[i]
                n = len(cur_X)  # number of edges
                s = self._initialize_discourse(n)
                pre_score = self.get_regular_scoring(gold_discourse, s, 'f1')
                pre_s_f = get_s_feature_vector('your input and the previous input')
                pre_feature = pre_s_f
                for t in range(self.num_samples):  # t is the index of the sample
                    cur_s = self.sample_discourse(s, cur_X)
                    cur_score = self.get_regular_scoring(gold_discourse, cur_s, 'f1')
                    w_dif = np.abs(cur_score - pre_score)
                    # if w_dif <= self.early_stop_threshold:
                    #     break
                    arg_max = np.argmax([pre_score, cur_score])
                    cur_s_f = get_s_feature_vector('your input and the current discourse labels')
                    cur_feature = cur_s_f
                    # feature difference: better-scoring sample minus worse-scoring sample
                    f_dif = np.subtract([pre_feature, cur_feature][arg_max],
                                        [pre_feature, cur_feature][np.abs(arg_max-1)])
                    if self.metropolis_hastings(cur_feature, pre_feature, theta):
                        s = cur_s
                        pre_feature = cur_feature
                        pre_score = cur_score
                    # SampleRank update: move theta towards the better sample when the
                    # model's margin is smaller than the difference in task score
                    if (np.dot(f_dif, theta) < w_dif) and (w_dif != 0):
                        theta = np.add(theta, np.multiply(self.get_learning_rate(), f_dif))
        return theta
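
# As a minimal sketch of the contract the trainer above assumes (an
# illustrative assumption, not the original implementation), a discourse
# feature function could look like:
#
#     def get_s_feature_vector(cur_X, s):
#         '''Encode the edges of one document and a candidate discourse
#         labelling s into a vector of global_params.num_s_features floats.'''
#         features = np.zeros(global_params.num_s_features)
#         # ... set indicator features for each (edge, relation) pair ...
#         return features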

class Log_linear_model_inference(object):
    '''
    This is how we iteratively infer discourse and content by using dynamic
    programming and integer linear programming.
    s represents discourse and c represents content.
    global_params.num_*_features is the number of * features.
    global_params.relation_tags is the predefined set of discourse relations.
    levenshtein computes the edit distance between two vectors.
    '''

    def __init__(self):
        '''
        Constructor
        '''
        pass

    def cal_scoring_function(self, weights, features):
        return np.dot(weights, features)

    def linear_programming(self, X, nodes, node_map, idea_candidates, heads, s, theta, clusters):
        '''Select content with a 0/1 integer linear program, given discourse labels s.'''
        scores = []
        # a0 = 0
        s = [len(global_params.relation_tags)] + s  # prepend a placeholder relation so s aligns with nodes
        for r_id in range(len(s)):
            c_id = 0
            for k, v in idea_candidates[r_id].iteritems():
                # score each candidate idea with content label 0 and 1
                features0 = get_c_feature_vector_idea(node_map[nodes[r_id]], clusters, s[r_id], v, heads[r_id][c_id], 0, features=[], pre_c=None)
                features20 = get_c_feature_vector_idea2(node_map[nodes[r_id]], clusters, v, heads[r_id][c_id], 0, features=[], pre_c=None)
                features1 = get_c_feature_vector_idea(node_map[nodes[r_id]], clusters, s[r_id], v, heads[r_id][c_id], 1, features=[], pre_c=None)
                features21 = get_c_feature_vector_idea2(node_map[nodes[r_id]], clusters, v, heads[r_id][c_id], 1, features=[], pre_c=None)
                features0 = np.append(features0, features20)
                features1 = np.append(features1, features21)
                score0 = self.cal_scoring_function(theta, features0)
                score1 = self.cal_scoring_function(theta, features1)
                scores.append(score1 - score0)
                c_id += 1
                # a0 += score0
        s = s[1:]
        # map idea-level score differences to cluster level (map_idea_cluster and
        # map_cluster_idea are not shown in this section)
        cluster_score = self.map_idea_cluster(scores, clusters)
        # build a binary integer program over the clusters and solve it with lp_solve
        lp = lp_maker(list(cluster_score), [list(np.ones(len(cluster_score)))], [-1], [1], None, None, None)
        z = lpsolve('set_binary', lp, range(1, len(cluster_score)+1))
        solve = lpsolve('solve', lp)
        obj = lpsolve('get_objective', lp)
        x = lpsolve('get_variables', lp)[0]
        lpsolve('delete_lp', lp)
        pred = self.map_cluster_idea(x, clusters, len(scores))
        # print sum([len(i) for i in heads])
        return pred

    def inference_joint_s(self, X, nodes, clusters, node_map, idea_candidates, heads, theta, c):
        '''Viterbi inference over discourse relations, with the content assignment c held fixed.'''
        pi = []
        s_theta = theta[:global_params.num_s_features]
        c_theta = theta[global_params.num_s_features:global_params.num_s_features+global_params.num_c_features]
        # c_theta2 = theta[global_params.num_s_features+global_params.num_c_features:]
        path = [np.zeros(len(X)) for i in range(len(global_params.relation_tags))]
        # initialization
        for r_index in range(len(global_params.relation_tags)):
            s_features = get_s_feature_vector_node(node_map[X[0][0]], node_map[X[0][1]], r_index, np.zeros(global_params.num_s_features))
            c_features = get_c_feature_vector_node(node_map[nodes[1]], clusters, r_index, idea_candidates[1], heads[1], c[1], np.zeros(global_params.num_c_features))
            score_s = self.cal_scoring_function(s_theta, s_features)
            score_c = self.cal_scoring_function(c_theta, c_features)
            pi.append(score_s + score_c)
        # recursion
        # dp = np.zeros(len(global_params.relation_tags)+len(idea_candidates))
        for t in range(1, len(X)):
            cur_pi = np.zeros(len(global_params.relation_tags))
            # current relation
            for j in range(len(global_params.relation_tags)):
                max_score = -1e10
                # previous relation
                for i in range(len(global_params.relation_tags)):
                    s_features = get_s_feature_vector_node(node_map[X[t][0]], node_map[X[t][1]], j, np.zeros(global_params.num_s_features), node_map[X[t-1][0]], node_map[X[t-1][1]], i)
                    c_features = get_c_feature_vector_node(node_map[nodes[t+1]], clusters, j, idea_candidates[t+1], heads[t+1], c[t+1], np.zeros(global_params.num_c_features))
                    tmp = pi[i] + self.cal_scoring_function(s_theta, s_features) + self.cal_scoring_function(c_theta, c_features)
                    if max_score < tmp:
                        max_score = tmp
                        path[j][t] = i
                cur_pi[j] = max_score
            pi = copy.deepcopy(cur_pi)
        return self.find_path(path, np.argmax(pi))
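
    # The recursion in inference_joint_s() above (and inference() below) is a
    # standard Viterbi pass over the relation tags:
    #   pi_t(j) = max_i [ pi_{t-1}(i) + s_theta . f_s(x_t, j, i) + c_theta . f_c(node_{t+1}, j, c) ]
    # path[j][t] stores the arg max i, and find_path() backtracks it to recover
    # the best relation sequence.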
    def inference(self, X, node_map, theta):
        '''Use dynamic programming to infer discourse without considering content.'''
        pi = []
        s_theta = theta
        path = [np.zeros(len(X)) for i in range(len(global_params.relation_tags))]
        # initialization
        for r_index in range(len(global_params.relation_tags)):
            features = get_s_feature_vector_node(node_map[X[0][0]], node_map[X[0][1]], r_index, np.zeros(global_params.num_s_features))
            pi.append(self.cal_scoring_function(s_theta, features))
        # recursion
        for t in range(1, len(X)):
            cur_pi = np.zeros(len(global_params.relation_tags))
            # current relation
            for j in range(len(global_params.relation_tags)):
                max_score = -1e10
                # previous relation
                for i in range(len(global_params.relation_tags)):
                    features = get_s_feature_vector_node(node_map[X[t][0]], node_map[X[t][1]], j, np.zeros(global_params.num_s_features), node_map[X[t-1][0]], node_map[X[t-1][1]], i)
                    tmp = pi[i] + self.cal_scoring_function(s_theta, features)
                    if max_score < tmp:
                        max_score = tmp
                        path[j][t] = i
                cur_pi[j] = max_score
            pi = copy.deepcopy(cur_pi)
        return self.find_path(path, np.argmax(pi))

    def iterative_inference(self, X, nodes, node_map, idea_candidates, heads, theta, clusters):
        '''Iterative inference over discourse and content.'''
        # split the model parameters into two parts, for discourse and content separately
        s_theta = theta[:global_params.num_s_features]
        c_theta = theta[global_params.num_s_features:]
        # first infer discourse without considering content
        s0 = self.inference(X, node_map, s_theta)
        c = self.linear_programming(X, nodes, node_map, idea_candidates, heads, s0, c_theta, clusters)
        e = 0
        old_s = s0
        old_c = c
        while True:
            s = self.inference_joint_s(X, nodes, clusters, node_map, idea_candidates, heads, theta, c)
            c = self.linear_programming(X, nodes, node_map, idea_candidates, heads, s, c_theta, clusters)
            distance_s = levenshtein(''.join([str(i) for i in old_s]), ''.join([str(i) for i in s]))
            distance_c = levenshtein(''.join([str(i) for i in old_c]), ''.join([str(i) for i in reshape_vectors(c)]))
            if distance_s == 0 and distance_c == 0:
                # stop when the inference results no longer change
                break
            old_s = s
            old_c = c
            if e >= 10:  # maximum number of iterations
                break
            e += 1
        return s, c

    def find_path(self, path, end):
        '''Recover the best path from the backpointers produced by the Viterbi pass.'''
        max_path = [end]
        for index in range(-1, -len(path[0]), -1):
            end = path[int(end)][int(index)]
            max_path = [int(end)] + max_path
        return max_path
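
# Illustrative usage sketch (an assumption: `data`, `X`, `gold`, `nodes`,
# `node_map`, `idea_candidates`, `heads` and `clusters` are placeholders for
# your own input-loading code, and the epoch/sample counts are arbitrary):
#
#     sampler = Samplerank(data, train_index, num_epochs=10, num_samples=100)
#     sampler.X = X                        # encoded input documents
#     sampler.gold_discourses = gold       # gold discourse labels per document
#     sampler.log_linear_model = Log_linear_model_inference()
#     theta = sampler.samplerank_discourse_main()
#
#     model = Log_linear_model_inference()
#     s, c = model.iterative_inference(X[i], nodes, node_map, idea_candidates,
#                                      heads, theta, clusters)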