-
Notifications
You must be signed in to change notification settings - Fork 7
/
bracketed2dsearch.py
183 lines (150 loc) · 4.78 KB
/
bracketed2dsearch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
import six
assert six.PY3, "Please run me with Python3"
import ply.lex as lex
import ply.yacc as yacc
import readline
import urllib.parse
import requests
import sys
class Node:
    """A bracketed tree ``[dtype child child ...]`` built by the parser.

    ``dtype`` is the label that follows the opening bracket and ``children``
    is the list of items inside the brackets; each child is either a plain
    string or a nested ``Node``.
    """

    def __init__(self, dtype, children):
        self.dtype = dtype
        self.children = children

    def dsearch_ex_lin(self):
        """Return the query for a node whose children are all strings.

        Handles cases like ``[dep xxx xxx xxx xxx]``.  Every child takes a
        turn as the root of one alternative; the children listed before it
        are attached with ``>lin@L`` and those after it with ``>lin@R``.
        The alternatives are joined with ``|``.  Macros are not expanded
        here: each child is emitted as a quoted literal.

        Raises:
            AssertionError: if any child is not a string.
        """
        assert all(isinstance(c, str) for c in self.children)
        alternatives = []
        for root_idx, root in enumerate(self.children):
            parts = ['"' + root + '"']
            for other_idx, other in enumerate(self.children):
                if other_idx < root_idx:
                    parts.append('>lin@L "{}"'.format(other))
                elif other_idx > root_idx:
                    parts.append('>lin@R "{}"'.format(other))
            alternatives.append("(" + " ".join(parts) + ")")
        return "(" + " | ".join(alternatives) + ")"

    def dsearch_ex(self, macro_table=None):
        """Return the query expression for this tree.

        Args:
            macro_table: optional mapping from macro name to replacement
                text.  When omitted, the module-level ``macros`` dict is
                used, which is what the original code always did.

        Returns:
            The query as a single string.  A node made only of two or more
            strings is delegated to ``dsearch_ex_lin``.  Otherwise the one
            string child is the root (macro-expanded, or quoted when it is
            not a macro) and every ``Node`` child is attached to it with
            ``>`` (for dtype ``dep`` or ``_``) or ``>dtype``.

        Raises:
            ValueError: if several string children are mixed with subtrees.
            AssertionError: if there is no string child at all, or a child
                is neither a string nor a ``Node``.
        """
        string_children = [c for c in self.children if isinstance(c, str)]
        if len(string_children) == len(self.children) and len(self.children) > 1:
            return self.dsearch_ex_lin()
        if len(string_children) > 1:
            raise ValueError("Unsupported")
        assert len(string_children) == 1
        # Resolve the table only now so the branches above never touch the
        # module-level global.
        if macro_table is None:
            macro_table = macros
        root = string_children[0]
        bits = ["(", macro_table.get(root, '"' + root + '"')]  # Bits of the expression
        for child in self.children:
            if isinstance(child, str):
                # The only string child is the root, which is already emitted.
                continue
            assert isinstance(child, Node), repr(child)
            if child.dtype == "dep" or child.dtype == "_":
                bits.append(' > ')
            else:
                bits.append(' >' + child.dtype)
            bits.append(child.dsearch_ex(macro_table))
        bits.append(")")
        return " ".join(bits)
### ---------- lexer -------------
# Names of the token types the lexer produces. This tuple is always required.
tokens = (
    'LBRAC',
    'RBRAC',
    'STRING',
)
# Token rule for an opening square bracket.
def t_LBRAC(t):
    r'\['
    # The raw string above is not documentation: PLY reads a rule function's
    # docstring as the token's regular expression, so it must stay as-is.
    return t
# Token rule for a closing square bracket.
def t_RBRAC(t):
    r'\]'
    # The raw string above is not documentation: PLY reads a rule function's
    # docstring as the token's regular expression, so it must stay as-is.
    return t
# Token rule for a bare word: one or more characters that are neither
# whitespace nor a square bracket.
def t_STRING(t):
    r'[^\s\[\]]+'
    # The raw string above is not documentation: PLY reads a rule function's
    # docstring as the token's regular expression, so it must stay as-is.
    return t
# Characters the lexer skips without producing a token: space and tab.
t_ignore = " \t"
# Error handling rule
def t_error(t):
    """Report an unmatched character and resume lexing one character later.

    Prints the first character of ``t.value`` to stdout, then asks the
    lexer attached to the token to skip exactly one character.
    """
    print("Illegal character '%s'" % t.value[0])
    t.lexer.skip(1)
# Build the lexer. Per the PLY documentation, lex.lex() collects `tokens`,
# `t_ignore`, `t_error` and the t_* rule functions from this module, so this
# call has to stay below all of those definitions.
lexer = lex.lex()
### --------- grammar -----------
# Grammar rule: a non-empty sequence of expressions, collected into a list.
def p_expressions(p):
    '''expressions : expression
                   | expression expressions
    '''
    # The docstring above is the grammar production PLY parses, not
    # documentation. len(p) is the number of right-hand-side symbols plus
    # one, which is how the two alternatives are told apart.
    if len(p) == 2:
        p[0] = [p[1]]
    elif len(p) == 3:
        # Right-recursive case: prepend this expression to the rest.
        p[0] = [p[1]] + p[2]
    else:
        assert False
# Grammar rule: an expression is either a bracketed tree or a bare STRING;
# its value is passed through unchanged.
def p_expr(p):
    '''expression : tree
                  | STRING
    '''
    # The docstring above is the grammar production PLY parses, not
    # documentation.
    p[0] = p[1]
# Grammar rule: "[" STRING expressions "]" becomes a Node whose dtype is the
# STRING after the bracket and whose children are the parsed expressions.
def p_tree(p):
    'tree : LBRAC STRING expressions RBRAC'
    # The docstring above is the grammar production PLY parses, not
    # documentation. p[2] is the STRING and p[3] the expressions list.
    p[0] = Node(p[2], p[3])
def p_error(p):
    """Report a syntax error on stdout.

    The offending token ``p`` is not inspected and no recovery is attempted.
    """
    print("Syntax error in input!")
# Build the parser. Per the PLY documentation, yacc.yacc() collects `tokens`,
# `p_error` and the p_* rule functions from this module, so this call has to
# stay below all of those definitions.
parser = yacc.yacc()
def get_query_url(q):
    """Return the dep_search query-page URL for the search expression *q*.

    The URL targets the ``RU160M`` database with ``case_sensitive=False``
    and 50 hits per page; *q* is URL-encoded into the ``search`` parameter.
    """
    params = {
        "search": q,
        "db": "RU160M",
        "case_sensitive": "False",
        "hits_per_page": "50",
    }
    return "http://bionlp-www.utu.fi/dep_search/query" + "?" + urllib.parse.urlencode(params)
def download(qry, maxnum, fname):
    """Run *qry* against the dep_search web API and save the response text.

    Args:
        qry: search expression, sent as the ``search`` parameter.
        maxnum: value sent as the ``retmax`` parameter.
        fname: path of the file the response body is written to; the
            file is overwritten if it exists.

    The final request URL is printed to stdout.
    """
    data = {"search": qry, "db": "RU160M", "case": "False", "retmax": maxnum}
    result = requests.get("http://epsilon-it.utu.fi/dep_search_webapi", params=data)
    print(result.url)
    # Explicit UTF-8: without it open() uses the locale's encoding, and
    # writing non-ASCII text can fail with UnicodeEncodeError on some systems.
    # NOTE(review): the HTTP status is not checked, so an error page would be
    # saved like a normal result - confirm whether that is intended.
    with open(fname, "w", encoding="utf-8") as f:
        print(result.text, file=f)
### ---------- run this ------------
# Macro names that may be used inside bracketed expressions:
# * NP-Nom = NOUN Case=Nom
# * XP = any phrasal category = NOUN, ADJ, ADV, PRON, VERB
# * PRON-Dat = PRON Case=Dat
# * NOUN-Nom = NOUN Case=Nom
# * VP = VERB
# * AP = ADJ
# * VP-Inf = VERB VerbForm=Inf
# * Imper = Mood=Imp
# * dep = any dependency label
macros_def="""
NP-Nom : (NOUN&Nom)
NP-Dat : (NOUN&Dat)
XP : (NOUN|ADJ|ADV|PRON|VERB)
PRON-Dat : (PRON&Dat)
NOUN-Nom : (NOUN&Nom)
VP : VERB
AP : ADJ
VP-Inf : (VERB&Inf)
VP-Imper : (VERB&Mood=Imp)
V-Past : (VERB&Past)
Imper : (Mood=Imp)
Cl : (VERB >nsubj _)
_ : _
"""


def parse_macros(definition):
    """Parse ``NAME : replacement`` lines into a dict of name -> replacement.

    Each line is split on the first ``" : "`` only, so the replacement text
    may itself contain that separator.
    """
    table = {}
    for entry in definition.strip().split("\n"):
        name, replacement = entry.split(" : ", 1)
        table[name] = replacement
    return table


macros = parse_macros(macros_def)  # macro -> replacement


def read_expressions(lines):
    """Group bracketed expressions under the construction name before them.

    Blank lines are ignored. A line starting with ``[`` is an expression and
    is added to the most recent construction; any other line starts a new
    construction, named after the line with spaces replaced by underscores.
    A repeated name starts over with an empty list.

    Returns:
        dict mapping construction name -> list of expression strings.

    Raises:
        ValueError: if an expression appears before any construction name.
    """
    expressions = {}
    current = None
    for line in lines:
        line = line.strip()
        if not line:
            continue
        if line.startswith("["):
            # an expression
            if current is None:
                raise ValueError(
                    "Expression {!r} appears before any construction name".format(line)
                )
            current.append(line)
        else:
            # construction name
            current = []
            expressions[line.replace(" ", "_")] = current
    return expressions


def main():
    """Read constructions from stdin, print each query and download its hits."""
    import os

    expressions = read_expressions(sys.stdin)  # filename -> list of expressions
    # download() writes into dl/, so make sure the directory exists first.
    os.makedirs("dl", exist_ok=True)
    for fname, expression_list in sorted(expressions.items()):
        for expression in expression_list:
            print("Parsing expression", expression, file=sys.stderr, flush=True)
            node = parser.parse(expression)
            if not node:
                # No parse result for this line; subscripting it would raise
                # TypeError, so report and move on to the next expression.
                print("Skipping unparsable expression", expression,
                      file=sys.stderr, flush=True)
                continue
            qry = node[0].dsearch_ex()
            print(qry)
            # NOTE(review): every expression of one construction is written to
            # the same file, so only the last download survives - confirm.
            download(qry, 5, "dl/" + fname + ".conllu")


if __name__ == "__main__":
    main()