-
Notifications
You must be signed in to change notification settings - Fork 0
/
reorder_sgml.py
119 lines (102 loc) · 3.04 KB
/
reorder_sgml.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import io, re
from collections import defaultdict
# Default element ranking: a tag's index in this list is its priority value.
# Among spans of equal length, the one with the lower index is opened first
# (i.e. becomes the outer element) when tags are reordered.
PRIORITIES = [
    "sp",
    "table",
    "row",
    "cell",
    "head",
    "p",
    "figure",
    "caption",
    "list",
    "item",
    "quote",
    "s",
    "q",
    "hi",
    "sic",
    "ref",
    "date",
    "incident",
    "w",
]
class Span:
    """One element occurrence, described by the token positions it covers.

    Attributes:
        start: token index recorded when the element was opened.
        end: token index recorded when the element was closed.
        text: text stored for the span (shown by ``repr``).
        elem: element name.
        length: ``end - start``.
        priority: index of ``elem`` in ``priorities``, or 100 if not listed.
    """

    def __init__(self, start=0, end=0, text="", elem="", priorities=PRIORITIES):
        self.start, self.end = start, end
        self.text, self.elem = text, elem
        self.length = end - start
        # Unlisted element names get a fixed rank of 100.
        if elem in priorities:
            rank = priorities.index(elem)
        else:
            rank = 100
        self.priority = rank

    def __repr__(self):
        token_range = "-".join((str(self.start), str(self.end)))
        return token_range + ": " + self.text
def reorder(tt_sgml, priorities=PRIORITIES):
    """Reorder the tags of one-item-per-line SGML so longer spans enclose shorter ones.

    Each line of ``tt_sgml`` is read as a closing tag (``</x>``), a unary item
    (``<x/>`` or any line starting with ``<?``), an opening tag (``<x ...>``),
    a blank line, or a token. Unary items are counted like tokens. Every
    opening tag is paired with the next closing tag of the same name to form a
    span over token positions. The tags are then written back around the
    tokens: tags opening before the same token are emitted longest span first,
    tags closing after the same token shortest span first, with ties decided
    by priority. Only the order of tags between two adjacent tokens changes;
    spans that genuinely overlap across tokens are not repaired.

    Empty elements (an opening tag closed before any token is seen) are
    emitted as an adjacent open/close pair directly before the token that
    follows them, inside any element opening at that token. If no token
    follows, they are emitted directly after the last token. Several empty
    elements at the same position are nested by priority.

    Blank lines are dropped. Opening tags that are never closed produce no
    span and therefore do not appear in the output.

    Args:
        tt_sgml: the SGML text, with one tag or token per line.
        priorities: element names in rank order, passed on to ``Span``.

    Returns:
        The reordered lines joined with newlines, always ending in a newline.

    Raises:
        IOError: if a closing tag has no matching open element.
    """

    def is_unary(line):
        # Self-closing tags and processing instructions occupy a token slot.
        return line.startswith("<?") or (line.startswith("<") and line.endswith("/>"))

    def close_tag(span):
        return "</" + span.elem + ">"

    # Pass 1: build data model
    open_elems = defaultdict(list)  # element name -> stack of (token index, opening line)
    lines = tt_sgml.split("\n")
    spans = []
    toknum = 1
    for line in lines:
        if line.startswith("</") and line.endswith(">"):  # Close element
            elem = re.search(r'^</([^\s>]*)', line).group(1)
            if elem not in open_elems:
                raise IOError("! saw a closed element: " + line + " but no corresponding element is open!\n")
            start, text = open_elems[elem].pop()
            if not open_elems[elem]:
                # Drop the key so the membership test above stays meaningful.
                del open_elems[elem]
            spans.append(Span(start=start, end=toknum, text=text, elem=elem, priorities=priorities))
        elif is_unary(line):  # Unary element, treat like token
            toknum += 1
        elif line.startswith("<") and line.endswith(">"):  # Open element
            elem = re.search(r'^<([^\s>]*)', line).group(1)
            open_elems[elem].append((toknum, line))
        elif line.strip():
            toknum += 1
    last_position = toknum  # one past the index of the final token

    # Build start/end dictionaries. Zero-length spans are kept apart: filing
    # them under both start and end would emit their closing tag after the
    # previous token, i.e. before their own opening tag.
    start_dict = defaultdict(list)
    end_dict = defaultdict(list)
    empty_dict = defaultdict(list)
    for span in spans:
        if span.length == 0:
            empty_dict[span.start].append(span)
        else:
            start_dict[span.start].append(span)
            end_dict[span.end].append(span)

    def empty_tags(position):
        # Open/close pairs for the empty elements at this position, nested by priority.
        nested = sorted(empty_dict.get(position, ()), key=lambda x: x.priority)
        return [span.text for span in nested] + [close_tag(span) for span in reversed(nested)]

    # Pass 2: reorder
    output = []
    toknum = 1
    for line in lines:
        is_markup = line.startswith("<") and line.endswith(">")
        if (is_markup or not line.strip()) and not is_unary(line):
            continue  # tags and blank lines are regenerated from the spans
        for span in sorted(start_dict.get(toknum, ()), key=lambda x: (-x.length, x.priority)):
            output.append(span.text)
        output.extend(empty_tags(toknum))
        output.append(line)
        toknum += 1
        if toknum == last_position:
            # No token follows: place trailing empty elements before the closers.
            output.extend(empty_tags(toknum))
        for span in sorted(end_dict.get(toknum, ()), key=lambda x: (x.length, -x.priority)):
            output.append(close_tag(span))
    if last_position == 1:
        # Input had no tokens at all; only empty elements can exist.
        output.extend(empty_tags(1))
    return "\n".join(output) + "\n"
if __name__ == "__main__":
    # Inline sample of the expected one-item-per-line input format.
    # NOTE: it is replaced by the file contents read below, so it only
    # serves as an illustration of the input.
    test_sgml = """<?xml version="1.0" ?>
<text dateCollected="2019-11-05" dateCreated="2019-02-04" dateModified="2019-04-23" id="autogum_bio_doc031" shortTile="ingo-ruczinski" sourceURL="https://en.wikipedia.org/wiki/Ingo_Ruczinski" speakerCount="0" speakerList="none" title="Ingo Ruczinski" type="bio">
<head>
<s>
Awards
and
honors
</s>
</head>
<p>
<s>
In
2016
</s>
<s>
Ingo
Ruczinski
had
became
an
elected
fellow
of
the
<ref target="https://en.wikipedia.org/wiki/American_Statistical_Association">
American
Statistical
Association
</ref>
.
<ref target="https://en.wikipedia.org/wiki/Main_Page#cite_note-2">
[
2
]
</s>
</ref>
</p>
</text>"""
    # Read the test document through a context manager so the file handle is
    # closed deterministically instead of being left to garbage collection.
    with io.open("test_reorder.sgml", encoding="utf8") as sgml_file:
        test_sgml = sgml_file.read()
    print(reorder(test_sgml))