-
Notifications
You must be signed in to change notification settings - Fork 0
/
parse.py
executable file
·59 lines (53 loc) · 1.8 KB
/
parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
#!/usr/bin/env python
import csv
import io
import json

import lxml.html
def _parse_table(html):
    """Collect one record per data row of the results table.

    ``html`` is the parsed document. The first ``<table class="Table">``
    found in it is used, and the year is the last space-separated token of
    the text in the element whose id is
    ``ctl00_ContentPlaceHolder1_lbl_gpAwdYr``.

    Returns a list of the dicts produced by ``_parse_row``, each extended
    with a ``'year'`` key. Raises ``IndexError`` when the table or the year
    label is not present.
    """
    results_table = html.xpath('//table[@class="Table"]')[0]
    year_label = html.xpath('id("ctl00_ContentPlaceHolder1_lbl_gpAwdYr")/text()')[0]
    year = year_label.rpartition(' ')[2]

    # The first <tr> directly under the table is skipped (presumably the
    # header row -- confirm against a fixture).
    records = []
    for table_row in results_table.xpath('tr[position()>1]'):
        record = _parse_row(table_row)
        record['year'] = year
        records.append(record)
    return records
def _parse_row(tr):
    """Pull the location names out of a single table row.

    ``tr`` must offer an ``xpath`` method (an lxml ``<tr>`` element in
    practice). The first text node of cells 2 to 5 is read, in that order,
    and stripped of surrounding whitespace; cell 1 is not read.

    Returns a dict with the keys ``state``, ``district``, ``block`` and
    ``panchayat``. Raises ``IndexError`` if one of those cells has no text
    node.
    """
    columns = (
        ('state', 2),
        ('district', 3),
        ('block', 4),
        ('panchayat', 5),
    )
    record = {}
    for key, position in columns:
        text_nodes = tr.xpath('td[position()=%d]/text()' % position)
        record[key] = text_nodes[0].strip()
    return record
def _csv(data):
    """Render the parsed rows as CSV text.

    ``data`` is an iterable of dicts with the keys ``year``, ``state``,
    ``district``, ``block`` and ``panchayat``.

    Returns a string holding a header line followed by one line per row.
    Values containing a comma, a double quote or a line break are quoted by
    the ``csv`` module, so such a name can no longer shift the columns.
    Raises ``KeyError`` if a row lacks one of the keys.
    """
    buffer = io.StringIO(newline='')
    # Lines still end with a bare '\r', as they always have, so output for
    # plain values is byte-for-byte what consumers already receive.
    # NOTE(review): '\r\n' is the usual CSV terminator -- confirm whether
    # '\r' is intentional.
    writer = csv.writer(buffer, lineterminator='\r')
    writer.writerow([
        'Year',
        'State Name',
        'District Name',
        'Block Name',
        'Panchayat Name',
    ])
    for row in data:
        writer.writerow([
            row['year'],
            row['state'],
            row['district'],
            row['block'],
            row['panchayat'],
        ])
    return buffer.getvalue()
def _msewage(data):
    """Serialise the rows as JSON for the msewage importer.

    Target format: https://github.com/jcmuller/msewage-importer

    ``data`` is an iterable of dicts with the keys ``year``, ``state``,
    ``district``, ``block`` and ``panchayat``.

    Returns a JSON string of the form ``{"sources": [...]}`` where each
    source has a ``name``, a comma-separated ``location`` ending in
    ``India``, and a ``description`` naming the year.
    """
    sources = [
        {
            "name": row['panchayat'],
            "location": '%(panchayat)s, %(block)s, %(district)s, %(state)s, India' % row,
            "description": 'Updated in %s' % row['year'],
        }
        for row in data
    ]
    return json.dumps({"sources": sources})
def main(fixture_file, format):
    """Parse an HTML file and return its rows in the requested format.

    Args:
        fixture_file: Passed straight to ``lxml.html.parse``; the
            command-line entry point supplies a path string.
        format: ``'msewage'`` for JSON output or ``'csv'`` for CSV output.

    Returns:
        The rendered output as a string.

    Raises:
        ValueError: If ``format`` is neither ``'msewage'`` nor ``'csv'``.
    """
    # Reject unknown formats before parsing: previously they fell through
    # both branches and the function silently returned None.
    if format not in ('msewage', 'csv'):
        raise ValueError(
            "Unknown format %r (expected 'msewage' or 'csv')" % (format,)
        )

    html = lxml.html.parse(fixture_file)
    data = _parse_table(html)
    if format == 'msewage':
        return _msewage(data)
    return _csv(data)
if __name__ == '__main__':
    import sys

    # Two arguments are required: the input file and the output format.
    # Misuse exits with the usage text on stderr and a non-zero status
    # rather than a TypeError traceback.
    if len(sys.argv) != 3:
        sys.exit('USAGE: %s [html file] [msewage|csv]' % sys.argv[0])
    print(main(sys.argv[1], sys.argv[2]))