comparison main.py @ 1:e23b7617bbc4

reducing redundancy
author Dennis C. M. <dennis@denniscm.com>
date Wed, 31 May 2023 20:19:26 +0100
parents 556768c7d3d7
children
comparison
equal deleted inserted replaced
0:556768c7d3d7 1:e23b7617bbc4
1 import json 1 import json
2 from datetime import datetime 2 from datetime import datetime
3 from collections import defaultdict
3 4
4 5
5 def main(): 6 def main():
7 data = defaultdict(dict)
8 date_index = defaultdict(dict)
9
6 with open('santander.json') as f: 10 with open('santander.json') as f:
7 doc = json.load(f) 11 doc = json.load(f)
8 12
9 if doc['JobStatus'] != 'SUCCEEDED': 13 if doc['JobStatus'] != 'SUCCEEDED':
10 print(f"JOB STATUS: {doc['JobStatus']}") 14 print(f"JOB STATUS: {doc['JobStatus']}")
11 15
12 return 16 return
13 17
14 blocks = doc['Blocks'] 18 blocks = doc['Blocks']
15 table = extract_block(blocks, 'BlockType', 'TABLE') 19
16 table_child_ids = extract_child_ids(table) 20 # Get format
21 lines = filter_blocks(blocks, 'BlockType', 'LINE')
22 for line in lines:
23 format = get_format(line['Text'])
24 data['format'] = format
25 if format:
26 break
27
28 # Find dates value and position
29 cells = filter_blocks(blocks, 'BlockType', 'CELL')
30 for cell in cells:
31 child_ids = extract_child_ids(cell)
32
33 # Get `Text` from `CELL` block
34 cell_text = ''
35 for index, child_id in enumerate(child_ids):
36 word_block = filter_blocks(blocks, 'Id', child_id)[0]
37 cell_text += word_block['Text']
38
39 date_string = is_date(cell_text)
40 if date_string:
41 cell_text = date_string
42 date_index[date_string]['column'] = cell['ColumnIndex']
43 date_index[date_string]['row'] = cell['RowIndex']
44
45
46 cell_row_index = cell['RowIndex']
47 cell_column_index = cell['ColumnIndex']
48 data['rows'][cell_row_index][cell_column_index] = cell_text
49
50 # Delete unused rows
51 for year in date_index:
52 for row in data['rows']:
53 print(row)
54 exit()
55 if year[row] < row:
56 del data[row]
57
58 print(data)
59
17 60
18 for table_child_id in table_child_ids: 61
19 cell = extract_block(blocks, 'Id', table_child_id)
20 cell_child_ids = extract_child_ids(cell)
21 62
22 cell_value = ''
23 for index, cell_child_id in enumerate(cell_child_ids):
24 word_block = extract_block(blocks, 'Id', cell_child_id)
25 cell_value += word_block['Text'].lower()
26 63
27 if index < len(cell_child_ids) - 1: 64 print(data)
28 cell_value += '_'
29
30 print(cell_value)
31 print(is_date(cell_value))
32 65
33 66
34 67
68 """
69 # Get table
70 table = filter_blocks(blocks, 'BlockType', 'TABLE')[0]
71 table_child_ids = extract_child_ids(table)
72
73 # Iterate over childs and get `CELL` blocks
74 for table_child_id in table_child_ids:
75 cell = filter_blocks(blocks, 'Id', table_child_id)[0]
76 cell_child_ids = extract_child_ids(cell)
77
78 # Get `Text` from `CELL` block
79 cell_text = ''
80 for cell_child_id in cell_child_ids:
81 word_block = filter_blocks(blocks, 'Id', cell_child_id)[0]
82 cell_text += word_block['Text']
83
84 # Check if cell_text could be a date
85 date_string = is_date(cell_text)
86 if date_string:
87 date_column_index = cell['ColumnIndex']
88 data[date_column_index] = {'year': date_string}
89 """
90
91
92 def filter_blocks(blocks, block_key, block_value):
93 """
94 Extract a block by key-value from array of blocks
95 """
96
97 return [block for block in blocks if block[block_key] == block_value]
98
99
35 def extract_child_ids(block): 100 def extract_child_ids(block):
101 """
102 Extract child Ids from a block
103 """
104
36 if not 'Relationships' in block: 105 if not 'Relationships' in block:
37 return [] 106 return []
38 107
39 return [r['Ids'] for r in block['Relationships'] if r['Type'] == 'CHILD'][0] 108 return [r['Ids'] for r in block['Relationships'] if r['Type'] == 'CHILD'][0]
40 109
41 110
42 def extract_block(blocks, block_key, block_value): 111 def is_date(string_date):
43 return [block for block in blocks if block[block_key] == block_value][0] 112 """
113 Verify if a string could be a date
114 """
44 115
45 116 formats_allowed = ['%d-%m-%Y', '%d_%m_%Y', '%d/%m/%Y', '%d.%m.%Y', '%Y']
46 def is_date(string_date):
47 formats_allowed = ['%d-%m-%Y', '%d/%m/%Y', '%d.%m.%Y', '%Y']
48 117
49 for format_allowed in formats_allowed: 118 for format_allowed in formats_allowed:
50 try: 119 try:
51 datetime.strptime(string_date, format_allowed) 120 date = datetime.strptime(string_date, format_allowed)
52 121
53 return True 122 return date.strftime("%Y")
54 except ValueError: 123 except ValueError:
55 124
56 # Try removing characters from the beginning and end 125 # Try removing characters from the beginning and end
57 options = [string_date[:-1], string_date[1:], string_date[1:-1]] 126 options = [string_date[:-1], string_date[1:], string_date[1:-1]]
58 for option in options: 127 for option in options:
59 try: 128 try:
60 datetime.strptime(option, format_allowed) 129 date = datetime.strptime(option, format_allowed)
61 130
62 return True 131 return date.strftime("%Y")
63 except ValueError: 132 except ValueError:
64 continue 133 continue
65 134
66 return False 135 return
136
137
138 def get_format(phrase):
139 """
140 Given a phrase verify if it is specified the amount format
141 """
142
143 amount_formats = ['thousand', 'million', 'billion']
144
145 for amount_format in amount_formats:
146 plural_amount_format = f'{amount_format}s'
147
148 if amount_format in phrase or plural_amount_format in phrase:
149 return amount_format
150
151
152 def clean(string_type, string):
153 characters = ['.', ',', '-', ' ']
154
155 clean_string = string
156 for character in characters:
157 clean_string = clean_string.replace(character, '')
158
159 return clean_string
160
161
162 def format_amount(string_amount):
163 pass
67 164
68 165
69 if __name__ == '__main__': 166 if __name__ == '__main__':
70 main() 167 main()
168
169 """
170 Assumptions:
171 - Thousand separator is `,`
172 - Supported date formats '%d-%m-%Y', '%d_%m_%Y', '%d/%m/%Y', '%d.%m.%Y', '%Y'
173 - Accounting values are in the same column and below the date.
174 +-------+-------+
175 | 2022 | 2023 |
176 +-------+-------+
177 | 3,000 | 3,100 |
178 +-------+-------+
179 | 120 | 150 |
180 +-------+-------+
181 | 789 | 800 |
182 +-------+-------+
183 - Account names must be in column index 1
184 """