#coding:utf-8
import string
from multiprocessing import Process
from feature import Feature
from example import Example
from random import shuffle
import json
from tools.PorterStemmer import PorterStemmer
import re
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
# Singleton decorator; apply as @singleton above a class definition so that
# every instantiation returns the same cached instance.
def singleton(cls):
instances = {}
def _singleton(*args, **kw):
if cls not in instances:
instances[cls] = cls(*args, **kw)
return instances[cls]
return _singleton
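
# A minimal usage sketch (MyRegistry is a hypothetical class, for illustration):
#     @singleton
#     class MyRegistry(object):
#         pass
#     assert MyRegistry() is MyRegistry()  # both calls return the one cached instance
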
# Return the index range of every occurrence of span_tokens as a contiguous
# subsequence of sent_tokens.
def getSpanIndecesInSent(span_tokens, sent_tokens):
    indices = []
    span_length = len(span_tokens)
    sent_length = len(sent_tokens)
    for i in xrange(sent_length):
        if (i + span_length) <= sent_length and sent_tokens[i:i + span_length] == span_tokens:
            indices.append(range(i, i + span_length))
    return indices
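
# Worked example (made-up tokens):
#     getSpanIndecesInSent(["a", "b"], ["a", "b", "c", "a", "b"])
# returns [[0, 1], [3, 4]] -- one index range per occurrence of the span.
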
''' Remove all punctuation characters from a string. '''
def removePuctuation(s):
    # string.punctuation already contains the backtick and apostrophe, so the
    # appended "``" and "''" are redundant for this per-character test.
    exclude = string.punctuation + "``" + "''"
    s = ''.join(ch for ch in s if ch not in exclude)
    return s
''' Run the feature functions in parallel; despite the name, each one runs
in its own multiprocessing Process, not a thread. '''
def run_multiple_threads(feature_function_list):
    procs = []
    for feat_fun in feature_function_list:
        p = Process(target=feat_fun)
        procs.append(p)
    for p in procs:
        p.start()
    for p in procs:
        p.join()
# Merge all the features in feature_list into one sparse feature, shifting
# each feature's indices by the total dimension of the features before it.
def mergeFeatures(feature_list, name=""):
    dimension = 0
    feat_string = ""
    for feature in feature_list:
        if dimension == 0:  # first feature: copy its string unchanged
            feat_string = feature.feat_string
        else:
            if feature.feat_string != "":
                # shift this feature's indices by the accumulated dimension
                temp = ""
                for item in feature.feat_string.split(" "):
                    index, value = item.split(":")
                    temp += " %d:%s" % (int(index) + dimension, value)
                feat_string += temp
        dimension += feature.dimension
    merged_feature = Feature(name, dimension, {})
    merged_feature.feat_string = feat_string.strip()
    return merged_feature
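
# Illustrative example (made-up values): mergeFeatures applied to a
# 3-dimensional feature with feat_string "1:1 3:0.5" and a 2-dimensional
# feature with feat_string "1:1" yields dimension 5 and feat_string
# "1:1 3:0.5 4:1" -- the second feature's index 1 is shifted by the first
# feature's dimension (3).
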
# Remove every entry whose value is below threshold from the dict (in place).
def removeItemsInDict(dict, threshold=1):
    if threshold > 1:
        for key in list(dict.keys()):  # copy the keys: we mutate while iterating
            if dict[key] < threshold:
                dict.pop(key)
# write dict keys to file
def write_dict_keys_to_file(dict, file_path):
file_out = open(file_path,"w")
file_out.write("\n".join([str(key) for key in dict.keys()]))
file_out.close()
def load_list_from_file(list_file_path):
    list_file = open(list_file_path)
    list = [line.strip() for line in list_file]
    list_file.close()
    return list

def load_set_from_file(list_file_path):
    return set(load_list_from_file(list_file_path))
# Map each non-empty line of the file to its 1-based line number.
def load_dict_from_file(dict_file_path):
dict = {}
dict_file = open(dict_file_path)
lines = [line.strip() for line in dict_file]
for index, line in enumerate(lines):
if line == "":
continue
dict[line] = index+1
dict_file.close()
return dict
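
# Example (hypothetical file contents): load_dict_from_file on a file with
# the lines "the", "", "of" returns {"the": 1, "of": 3}; blank lines are
# skipped but still advance the index.
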
def write_example_list_to_file(example_list, to_file):
    file_out = open(to_file, "w")
    file_out.write("\n".join([example.content + " # " + example.comment for example in example_list]))
    file_out.close()
# NOTE: shuffles example_list in place before writing it out.
def write_shuffled_example_list_to_file(example_list, to_file):
shuffle(example_list)
write_example_list_to_file(example_list, to_file)
# Collapse consecutive duplicate nodes in a "-->"-separated path.
def get_compressed_path(path):
list = path.split("-->")
temp = []
for i in range(len(list)):
if i+1 < len(list) and list[i] != list[i+1] :
temp.append(list[i])
if i+1 == len(list):
temp.append(list[i])
return "-->".join(temp)
# Same as get_compressed_path, but with an arbitrary separator Tag.
def get_compressed_path_tag(path, Tag):
list = path.split(Tag)
temp = []
for i in range(len(list)):
if i+1 < len(list) and list[i] != list[i+1] :
temp.append(list[i])
if i+1 == len(list):
temp.append(list[i])
return Tag.join(temp)
# Write one JSON object per line (JSON Lines format).
def write_dict_list_to_json_file(dict_list, json_path):
    fout = open(json_path, 'w')
    strs = [json.dumps(innerdict) for innerdict in dict_list]
    fout.write("\n".join(strs))
    fout.close()
# Increment dict[key]; the stored value is the frequency of the key.
def set_dict_key_value(dict, key):
if key not in dict:
dict[key] = 0
dict[key] += 1
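
# Usage sketch (hypothetical tokens):
#     counts = {}
#     for token in ["a", "b", "a"]:
#         set_dict_key_value(counts, token)
#     # counts == {"a": 2, "b": 1}
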
# Strip leading and trailing punctuation tokens from a list of
# (word, POS-tag) pairs. item[1] is tested with substring membership
# against the characters below, so multi-character tags such as "``",
# "''" and the bracket tags -LRB-/-RRB-/-LCB-/-RCB- also match.
def list_strip_punctuation(list):
    punctuation = """!"#&'*+,-..../:;<=>?@[\]^_`|~""" + "``" + "''"
    i = 0
    while i < len(list) and list[i][1] in punctuation + "-LCB--LRB-":
        i += 1
    if i == len(list):
        return []
    j = len(list) - 1
    while j >= 0 and list[j][1] in punctuation + "-RRB--RCB-":
        j -= 1
    return list[i: j + 1]
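
# Example with made-up (word, tag) pairs:
#     list_strip_punctuation([("``", "``"), ("He", "PRP"), ("left", "VBD"), (".", ".")])
# returns [("He", "PRP"), ("left", "VBD")].
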
# Lower-case the line and Porter-stem every alphabetic word, passing
# non-alphabetic characters through unchanged.
def stem_string(line):
if line == "":
return ""
p = PorterStemmer()
word = ""
output = ""
for c in line:
if c.isalpha():
word += c.lower()
else:
if word:
output += p.stem(word, 0,len(word)-1)
word = ''
output += c.lower()
if word:
output += p.stem(word, 0,len(word)-1)
return output
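
# e.g. stem_string("Running dogs") should return "run dog" with the bundled
# Porter stemmer (illustrative input).
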
def stem_list(list):
return [stem_string(item) for item in list]
# Pairwise products of all elements of the two lists (a flattened outer
# product), not a 3-D vector cross product despite the name.
def cross_product(list1, list2):
t = []
for i in list1:
for j in list2:
t.append(i * j)
return t
# True iff the string is an (optionally negative) integer or decimal.
def is_number(number):
    return re.match(r"^(-?\d+)(\.\d+)?$", number) is not None
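
# e.g. is_number("-3.14") is True, while is_number("1e5") and is_number("1,000")
# are False (no exponent or thousands-separator support).
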
def vec_plus_vec(vec1, vec2):
if len(vec1) != len(vec2):
raise ValueError("vec1 and vec2 do not have the same length !")
t = []
for v1, v2 in zip(vec1, vec2):
t.append(v1 + v2)
return t
# Lemmatize a word using its Penn Treebank POS tag (mapped to a WordNet POS);
# the lower-cased word is returned unchanged when the tag has no mapping.
def lemma_word(word, pos):
lmtzr = WordNetLemmatizer()
word = word.lower()
pos = get_wn_pos(pos)
if pos == "":
return word
word = lmtzr.lemmatize(word, pos)
return word
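
# e.g. lemma_word("running", "VBG") returns "run" and lemma_word("cats", "NNS")
# returns "cat" (requires the NLTK WordNet data to be installed).
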
# Map a Penn Treebank tag prefix to the corresponding WordNet POS constant.
def get_wn_pos(tree_bank_tag):
if tree_bank_tag.startswith('J'):
return wordnet.ADJ
elif tree_bank_tag.startswith('V'):
return wordnet.VERB
elif tree_bank_tag.startswith('N'):
return wordnet.NOUN
elif tree_bank_tag.startswith('R'):
return wordnet.ADV
else:
return ''