'''
@author: Kechen Qin

Please check out our paper and the project website:
https://arxiv.org/abs/1705.05039
http://www.ccs.neu.edu/home/kechenqin/paper/acl2017.html

Please install the following packages:
lp_solve: http://lpsolve.sourceforge.net/5.5/Python.htm
numpy
sklearn

Please implement your own feature functions and input function.
'''
import numpy as np
import random
import copy
from sklearn import metrics
from lp_solve import *
from lp_maker import lp_maker
# Note: global_params (relation_tags, num_s_features, num_c_features) and the
# feature functions used below (get_s_feature_vector*, get_c_feature_vector*,
# levenshtein, reshape_vectors) are expected to be provided by your own code.


class Samplerank(object):
    '''
    SampleRank trainer for the discourse prediction task.
    '''

    def __init__(self, data, train_index, num_epochs, num_samples):
        '''
        Constructor
        '''
        self.data = data
        self.train_index = train_index
        self.num_epochs = num_epochs
        self.num_samples = num_samples
        self.early_stop_threshold = 0.01
        # samplerank_discourse_main() also expects self.X (encoded input
        # documents), self.gold_discourses (gold labels) and
        # self.log_linear_model (a Log_linear_model_inference instance) to be
        # set by your own input-loading code.

    def _initialize_theta(self, d):
        return [random.uniform(-1, 1) for i in range(d)]

    def _initialize_discourse(self, d):
        return [random.randint(0, len(global_params.relation_tags)-1) for i in range(d)]

    def sample_discourse(self, pre_s, cur_X):
        '''Sample a new discourse assignment based on the previous one.
        The proposal distribution is left to the user; the body below is an
        illustrative assumption (not the original implementation) that
        resamples the relation label of one randomly chosen edge.'''
        cur_s = list(pre_s)
        j = random.randint(0, len(cur_s)-1)
        cur_s[j] = random.randint(0, len(global_params.relation_tags)-1)
        return cur_s

    def get_regular_scoring(self, gold_standard, prediction, metric):
        if metric == 'acc':
            return metrics.accuracy_score(gold_standard, prediction)
        elif metric == 'f1':
            return np.mean(metrics.f1_score(gold_standard, prediction, average=None))
        elif metric == 'auc':
            return metrics.roc_auc_score(gold_standard, prediction)

    def get_learning_rate(self):
        return 0.01

    def metropolis_hastings(self, cur_feature, pre_feature, theta):
        '''The Metropolis-Hastings criterion is used to decide whether to accept the new sample.'''
        cur_l = self.log_linear_model.cal_scoring_function(theta, cur_feature)
        pre_l = self.log_linear_model.cal_scoring_function(theta, pre_feature)
        ratio = np.exp(cur_l - pre_l)  # equals exp(cur_l)/exp(pre_l), written to avoid overflow
        if ratio > 1:
            return True
        else:
            r = random.random()
            if r <= ratio:
                return True
            else:
                return False
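
    # Worked example of the acceptance test above (assumed numbers, not from
    # the paper): with theta = [1.0, -0.5], cur_feature = [1, 0] and
    # pre_feature = [0, 1], the ratio is exp(1.0 - (-0.5)) = exp(1.5) ~ 4.48 > 1,
    # so the proposed sample is always accepted; a ratio below 1 is accepted
    # with that probability.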

    def samplerank_discourse_main(self):
        '''This is an example of how the SampleRank algorithm works. For
        simplicity, I only show how to estimate parameters for the discourse
        prediction task.
        theta is the vector of model parameters that you want to estimate.
        get_s_feature_vector() is used to encode the input into a feature vector.'''
        pre_score = None
        pre_feature = None
        theta = self._initialize_theta('dimension of model parameters')  # pass the dimensionality of your feature vector
        for e in range(self.num_epochs):  # e is the index of the epoch
            print 'epoch:', e
            for i in range(len(self.X)):  # i is the index of the input document
                if i not in self.train_index:
                    continue
                print 'document:', i
                # per-document inputs: the i-th entry of each field plus the shared last two fields
                cur_data = [d[i] for d in self.data[:-2]] + list(self.data[-2:])
                cur_X = self.X[i]
                gold_discourse = self.gold_discourses[i]
                n = len(cur_X)  # number of edges
                s = self._initialize_discourse(n)
                pre_score = self.get_regular_scoring(gold_discourse, s, 'f1')
                pre_s_f = get_s_feature_vector('your input and the previous input')
                pre_feature = pre_s_f
                for t in range(self.num_samples):  # t is the index of the sample
                    cur_s = self.sample_discourse(s, cur_X)
                    cur_score = self.get_regular_scoring(gold_discourse, cur_s, 'f1')
                    w_dif = np.abs(cur_score - pre_score)
                    # if w_dif <= self.early_stop_threshold:
                    #     break
                    arg_max = np.argmax([pre_score, cur_score])
                    cur_s_f = get_s_feature_vector('your input and the current discourse labels')
                    cur_feature = cur_s_f
                    # feature difference: better-scoring sample minus worse-scoring sample
                    f_dif = np.subtract([pre_feature, cur_feature][arg_max],
                                        [pre_feature, cur_feature][np.abs(arg_max-1)])
                    if self.metropolis_hastings(cur_feature, pre_feature, theta):
                        s = cur_s
                        pre_feature = cur_feature
                        pre_score = cur_score
                    # SampleRank update: move theta towards the better sample when the
                    # model's margin is smaller than the difference in task score
                    if (np.dot(f_dif, theta) < w_dif) and (w_dif != 0):
                        theta = np.add(theta, np.multiply(self.get_learning_rate(), f_dif))
        return theta
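
# As a minimal sketch of the contract the trainer above assumes (an
# illustrative assumption, not the original implementation), a discourse
# feature function could look like:
#
#     def get_s_feature_vector(cur_X, s):
#         '''Encode the edges of one document and a candidate discourse
#         labelling s into a vector of global_params.num_s_features floats.'''
#         features = np.zeros(global_params.num_s_features)
#         # ... set indicator features for each (edge, relation) pair ...
#         return features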

class Log_linear_model_inference(object):
    '''
    This is how we iteratively infer discourse and content by using dynamic
    programming and integer linear programming.
    s represents discourse and c represents content.
    global_params.num_*_features is the number of * features.
    global_params.relation_tags is the predefined set of discourse relations.
    levenshtein computes the edit distance between two vectors.
    '''

    def __init__(self):
        '''
        Constructor
        '''
        pass

    def cal_scoring_function(self, weights, features):
        return np.dot(weights, features)

    def linear_programming(self, X, nodes, node_map, idea_candidates, heads, s, theta, clusters):
        '''Select content with a 0/1 integer linear program, given discourse labels s.'''
        scores = []
        # a0 = 0
        s = [len(global_params.relation_tags)] + s  # prepend a placeholder relation so s aligns with nodes
        for r_id in range(len(s)):
            c_id = 0
            for k, v in idea_candidates[r_id].iteritems():
                # score each candidate idea with content label 0 and 1
                features0 = get_c_feature_vector_idea(node_map[nodes[r_id]], clusters, s[r_id], v, heads[r_id][c_id], 0, features=[], pre_c=None)
                features20 = get_c_feature_vector_idea2(node_map[nodes[r_id]], clusters, v, heads[r_id][c_id], 0, features=[], pre_c=None)
                features1 = get_c_feature_vector_idea(node_map[nodes[r_id]], clusters, s[r_id], v, heads[r_id][c_id], 1, features=[], pre_c=None)
                features21 = get_c_feature_vector_idea2(node_map[nodes[r_id]], clusters, v, heads[r_id][c_id], 1, features=[], pre_c=None)
                features0 = np.append(features0, features20)
                features1 = np.append(features1, features21)
                score0 = self.cal_scoring_function(theta, features0)
                score1 = self.cal_scoring_function(theta, features1)
                scores.append(score1 - score0)
                c_id += 1
                # a0 += score0
        s = s[1:]
        # map idea-level score differences to cluster level (map_idea_cluster and
        # map_cluster_idea are not shown in this section)
        cluster_score = self.map_idea_cluster(scores, clusters)
        # build a binary integer program over the clusters and solve it with lp_solve
        lp = lp_maker(list(cluster_score), [list(np.ones(len(cluster_score)))], [-1], [1], None, None, None)
        z = lpsolve('set_binary', lp, range(1, len(cluster_score)+1))
        solve = lpsolve('solve', lp)
        obj = lpsolve('get_objective', lp)
        x = lpsolve('get_variables', lp)[0]
        lpsolve('delete_lp', lp)
        pred = self.map_cluster_idea(x, clusters, len(scores))
        # print sum([len(i) for i in heads])
        return pred

    def inference_joint_s(self, X, nodes, clusters, node_map, idea_candidates, heads, theta, c):
        '''Viterbi inference over discourse relations, with the content assignment c held fixed.'''
        pi = []
        s_theta = theta[:global_params.num_s_features]
        c_theta = theta[global_params.num_s_features:global_params.num_s_features+global_params.num_c_features]
        # c_theta2 = theta[global_params.num_s_features+global_params.num_c_features:]
        path = [np.zeros(len(X)) for i in range(len(global_params.relation_tags))]
        # initialization
        for r_index in range(len(global_params.relation_tags)):
            s_features = get_s_feature_vector_node(node_map[X[0][0]], node_map[X[0][1]], r_index, np.zeros(global_params.num_s_features))
            c_features = get_c_feature_vector_node(node_map[nodes[1]], clusters, r_index, idea_candidates[1], heads[1], c[1], np.zeros(global_params.num_c_features))
            score_s = self.cal_scoring_function(s_theta, s_features)
            score_c = self.cal_scoring_function(c_theta, c_features)
            pi.append(score_s + score_c)
        # recursion
        # dp = np.zeros(len(global_params.relation_tags)+len(idea_candidates))
        for t in range(1, len(X)):
            cur_pi = np.zeros(len(global_params.relation_tags))
            # current relation
            for j in range(len(global_params.relation_tags)):
                max_score = -1e10
                # previous relation
                for i in range(len(global_params.relation_tags)):
                    s_features = get_s_feature_vector_node(node_map[X[t][0]], node_map[X[t][1]], j, np.zeros(global_params.num_s_features), node_map[X[t-1][0]], node_map[X[t-1][1]], i)
                    c_features = get_c_feature_vector_node(node_map[nodes[t+1]], clusters, j, idea_candidates[t+1], heads[t+1], c[t+1], np.zeros(global_params.num_c_features))
                    tmp = pi[i] + self.cal_scoring_function(s_theta, s_features) + self.cal_scoring_function(c_theta, c_features)
                    if max_score < tmp:
                        max_score = tmp
                        path[j][t] = i
                cur_pi[j] = max_score
            pi = copy.deepcopy(cur_pi)
        return self.find_path(path, np.argmax(pi))
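
    # The recursion in inference_joint_s() above (and inference() below) is a
    # standard Viterbi pass over the relation tags:
    #   pi_t(j) = max_i [ pi_{t-1}(i) + s_theta . f_s(x_t, j, i) + c_theta . f_c(node_{t+1}, j, c) ]
    # path[j][t] stores the arg max i, and find_path() backtracks it to recover
    # the best relation sequence.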
    def inference(self, X, node_map, theta):
        '''Use dynamic programming to infer discourse without considering content.'''
        pi = []
        s_theta = theta
        path = [np.zeros(len(X)) for i in range(len(global_params.relation_tags))]
        # initialization
        for r_index in range(len(global_params.relation_tags)):
            features = get_s_feature_vector_node(node_map[X[0][0]], node_map[X[0][1]], r_index, np.zeros(global_params.num_s_features))
            pi.append(self.cal_scoring_function(s_theta, features))
        # recursion
        for t in range(1, len(X)):
            cur_pi = np.zeros(len(global_params.relation_tags))
            # current relation
            for j in range(len(global_params.relation_tags)):
                max_score = -1e10
                # previous relation
                for i in range(len(global_params.relation_tags)):
                    features = get_s_feature_vector_node(node_map[X[t][0]], node_map[X[t][1]], j, np.zeros(global_params.num_s_features), node_map[X[t-1][0]], node_map[X[t-1][1]], i)
                    tmp = pi[i] + self.cal_scoring_function(s_theta, features)
                    if max_score < tmp:
                        max_score = tmp
                        path[j][t] = i
                cur_pi[j] = max_score
            pi = copy.deepcopy(cur_pi)
        return self.find_path(path, np.argmax(pi))

    def iterative_inference(self, X, nodes, node_map, idea_candidates, heads, theta, clusters):
        '''Iterative inference over discourse and content.'''
        # split the model parameters into two parts, for discourse and content separately
        s_theta = theta[:global_params.num_s_features]
        c_theta = theta[global_params.num_s_features:]
        # first infer discourse without considering content
        s0 = self.inference(X, node_map, s_theta)
        c = self.linear_programming(X, nodes, node_map, idea_candidates, heads, s0, c_theta, clusters)
        e = 0
        old_s = s0
        old_c = c
        while True:
            s = self.inference_joint_s(X, nodes, clusters, node_map, idea_candidates, heads, theta, c)
            c = self.linear_programming(X, nodes, node_map, idea_candidates, heads, s, c_theta, clusters)
            distance_s = levenshtein(''.join([str(i) for i in old_s]), ''.join([str(i) for i in s]))
            distance_c = levenshtein(''.join([str(i) for i in old_c]), ''.join([str(i) for i in reshape_vectors(c)]))
            if distance_s == 0 and distance_c == 0:
                # stop when the inference results no longer change
                break
            old_s = s
            old_c = c
            if e >= 10:  # maximum number of iterations
                break
            e += 1
        return s, c

    def find_path(self, path, end):
        '''Recover the best path from the backpointers produced by the Viterbi pass.'''
        max_path = [end]
        for index in range(-1, -len(path[0]), -1):
            end = path[int(end)][int(index)]
            max_path = [int(end)] + max_path
        return max_path
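
# Illustrative usage sketch (an assumption: `data`, `X`, `gold`, `nodes`,
# `node_map`, `idea_candidates`, `heads` and `clusters` are placeholders for
# your own input-loading code, and the epoch/sample counts are arbitrary):
#
#     sampler = Samplerank(data, train_index, num_epochs=10, num_samples=100)
#     sampler.X = X                        # encoded input documents
#     sampler.gold_discourses = gold       # gold discourse labels per document
#     sampler.log_linear_model = Log_linear_model_inference()
#     theta = sampler.samplerank_discourse_main()
#
#     model = Log_linear_model_inference()
#     s, c = model.iterative_inference(X[i], nodes, node_map, idea_candidates,
#                                      heads, theta, clusters)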