forked from openspending/dpkg-uk25k
-
Notifications
You must be signed in to change notification settings - Fork 0
/
export.py
58 lines (50 loc) · 2.45 KB
/
export.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import sqlaload as sl
import csv
import sys
import logging
from common import *
if __name__ == '__main__':
logging.basicConfig()
logging.getLogger(__name__).setLevel(logging.DEBUG)
logging.getLogger('sqlalchemy.engine').setLevel(logging.WARN)
supplier_map = {}
engine = db_connect()
supplier_table = sl.get_table(engine, 'suppliers')
for row in sl.all(engine, supplier_table):
supplier_map[ row['original'] ] = row
writer = csv.DictWriter(open(sys.argv[1], 'wb'),
['SupplierName', 'DepartmentFamily', 'Entity',
'ExpenseType', 'ExpenseArea', 'VATNumber',
'AmountAsFloat',
'DateAsDate',
'SupplierGuess', 'SupplierGuessURI',
'Resource', 'Table',
'Publisher', 'SourceDescription',
],
extrasaction='ignore')
writer.writeheader()
source_table = sl.get_table(engine, 'source')
condensed_table = sl.get_table(engine, 'condensed')
for condensed_row in sl.all(engine, condensed_table):
resource_id = condensed_row['resource_id']
table_id = condensed_row['table_id']
table_suffix = '%s_table%s' % (resource_id, table_id)
source_row = sl.find_one(engine, source_table, resource_id=resource_id)
publisher = source_row['publisher']
source_description = source_row['description']
table = sl.get_table(engine, 'spending_%s' % table_suffix)
for row in sl.all(engine, table):
# Skip over rows that contain no usable data
if not row.has_key('SupplierName') or row.get('AmountAsFloat') is None or row.get('DateAsDate') is None:
continue
# Bring in all the other bits of data we've been collecting
row['Resource'] = resource_id
row['Table'] = table_id
supplier_guess = supplier_map.get(row['SupplierName'])
if supplier_guess is not None:
row['SupplierGuess'] = supplier_guess['name']
row['SupplierGuessURI'] = supplier_guess['uri']
row['Publisher'] = publisher
row['SourceDescription'] = source_description
# We need to flatten the unicode objects into utf8, because python csv is braindead
writer.writerow({k:unicode(v).encode('utf8') for k,v in row.iteritems()})