Mercurial > public > finance-parser
comparison main.py @ 1:e23b7617bbc4
reducing redundancy
author | Dennis C. M. <dennis@denniscm.com> |
---|---|
date | Wed, 31 May 2023 20:19:26 +0100 |
parents | 556768c7d3d7 |
children |
comparison
equal
deleted
inserted
replaced
0:556768c7d3d7 | 1:e23b7617bbc4 |
---|---|
1 import json | 1 import json |
2 from datetime import datetime | 2 from datetime import datetime |
3 from collections import defaultdict | |
3 | 4 |
4 | 5 |
5 def main(): | 6 def main(): |
7 data = defaultdict(dict) | |
8 date_index = defaultdict(dict) | |
9 | |
6 with open('santander.json') as f: | 10 with open('santander.json') as f: |
7 doc = json.load(f) | 11 doc = json.load(f) |
8 | 12 |
9 if doc['JobStatus'] != 'SUCCEEDED': | 13 if doc['JobStatus'] != 'SUCCEEDED': |
10 print(f"JOB STATUS: {doc['JobStatus']}") | 14 print(f"JOB STATUS: {doc['JobStatus']}") |
11 | 15 |
12 return | 16 return |
13 | 17 |
14 blocks = doc['Blocks'] | 18 blocks = doc['Blocks'] |
15 table = extract_block(blocks, 'BlockType', 'TABLE') | 19 |
16 table_child_ids = extract_child_ids(table) | 20 # Get format |
21 lines = filter_blocks(blocks, 'BlockType', 'LINE') | |
22 for line in lines: | |
23 format = get_format(line['Text']) | |
24 data['format'] = format | |
25 if format: | |
26 break | |
27 | |
28 # Find dates value and position | |
29 cells = filter_blocks(blocks, 'BlockType', 'CELL') | |
30 for cell in cells: | |
31 child_ids = extract_child_ids(cell) | |
32 | |
33 # Get `Text` from `CELL` block | |
34 cell_text = '' | |
35 for index, child_id in enumerate(child_ids): | |
36 word_block = filter_blocks(blocks, 'Id', child_id)[0] | |
37 cell_text += word_block['Text'] | |
38 | |
39 date_string = is_date(cell_text) | |
40 if date_string: | |
41 cell_text = date_string | |
42 date_index[date_string]['column'] = cell['ColumnIndex'] | |
43 date_index[date_string]['row'] = cell['RowIndex'] | |
44 | |
45 | |
46 cell_row_index = cell['RowIndex'] | |
47 cell_column_index = cell['ColumnIndex'] | |
48 data['rows'][cell_row_index][cell_column_index] = cell_text | |
49 | |
50 # Delete unused rows | |
51 for year in date_index: | |
52 for row in data['rows']: | |
53 print(row) | |
54 exit() | |
55 if year[row] < row: | |
56 del data[row] | |
57 | |
58 print(data) | |
59 | |
17 | 60 |
18 for table_child_id in table_child_ids: | 61 |
19 cell = extract_block(blocks, 'Id', table_child_id) | |
20 cell_child_ids = extract_child_ids(cell) | |
21 | 62 |
22 cell_value = '' | |
23 for index, cell_child_id in enumerate(cell_child_ids): | |
24 word_block = extract_block(blocks, 'Id', cell_child_id) | |
25 cell_value += word_block['Text'].lower() | |
26 | 63 |
27 if index < len(cell_child_ids) - 1: | 64 print(data) |
28 cell_value += '_' | |
29 | |
30 print(cell_value) | |
31 print(is_date(cell_value)) | |
32 | 65 |
33 | 66 |
34 | 67 |
68 """ | |
69 # Get table | |
70 table = filter_blocks(blocks, 'BlockType', 'TABLE')[0] | |
71 table_child_ids = extract_child_ids(table) | |
72 | |
73 # Iterate over childs and get `CELL` blocks | |
74 for table_child_id in table_child_ids: | |
75 cell = filter_blocks(blocks, 'Id', table_child_id)[0] | |
76 cell_child_ids = extract_child_ids(cell) | |
77 | |
78 # Get `Text` from `CELL` block | |
79 cell_text = '' | |
80 for cell_child_id in cell_child_ids: | |
81 word_block = filter_blocks(blocks, 'Id', cell_child_id)[0] | |
82 cell_text += word_block['Text'] | |
83 | |
84 # Check if cell_text could be a date | |
85 date_string = is_date(cell_text) | |
86 if date_string: | |
87 date_column_index = cell['ColumnIndex'] | |
88 data[date_column_index] = {'year': date_string} | |
89 """ | |
90 | |
91 | |
92 def filter_blocks(blocks, block_key, block_value): | |
93 """ | |
94 Extract a block by key-value from array of blocks | |
95 """ | |
96 | |
97 return [block for block in blocks if block[block_key] == block_value] | |
98 | |
99 | |
35 def extract_child_ids(block): | 100 def extract_child_ids(block): |
101 """ | |
102 Extract child Ids from a block | |
103 """ | |
104 | |
36 if not 'Relationships' in block: | 105 if not 'Relationships' in block: |
37 return [] | 106 return [] |
38 | 107 |
39 return [r['Ids'] for r in block['Relationships'] if r['Type'] == 'CHILD'][0] | 108 return [r['Ids'] for r in block['Relationships'] if r['Type'] == 'CHILD'][0] |
40 | 109 |
41 | 110 |
42 def extract_block(blocks, block_key, block_value): | 111 def is_date(string_date): |
43 return [block for block in blocks if block[block_key] == block_value][0] | 112 """ |
113 Verify if a string could be a date | |
114 """ | |
44 | 115 |
45 | 116 formats_allowed = ['%d-%m-%Y', '%d_%m_%Y', '%d/%m/%Y', '%d.%m.%Y', '%Y'] |
46 def is_date(string_date): | |
47 formats_allowed = ['%d-%m-%Y', '%d/%m/%Y', '%d.%m.%Y', '%Y'] | |
48 | 117 |
49 for format_allowed in formats_allowed: | 118 for format_allowed in formats_allowed: |
50 try: | 119 try: |
51 datetime.strptime(string_date, format_allowed) | 120 date = datetime.strptime(string_date, format_allowed) |
52 | 121 |
53 return True | 122 return date.strftime("%Y") |
54 except ValueError: | 123 except ValueError: |
55 | 124 |
56 # Try removing characters from the beginning and end | 125 # Try removing characters from the beginning and end |
57 options = [string_date[:-1], string_date[1:], string_date[1:-1]] | 126 options = [string_date[:-1], string_date[1:], string_date[1:-1]] |
58 for option in options: | 127 for option in options: |
59 try: | 128 try: |
60 datetime.strptime(option, format_allowed) | 129 date = datetime.strptime(option, format_allowed) |
61 | 130 |
62 return True | 131 return date.strftime("%Y") |
63 except ValueError: | 132 except ValueError: |
64 continue | 133 continue |
65 | 134 |
66 return False | 135 return |
136 | |
137 | |
138 def get_format(phrase): | |
139 """ | |
140 Given a phrase verify if it is specified the amount format | |
141 """ | |
142 | |
143 amount_formats = ['thousand', 'million', 'billion'] | |
144 | |
145 for amount_format in amount_formats: | |
146 plural_amount_format = f'{amount_format}s' | |
147 | |
148 if amount_format in phrase or plural_amount_format in phrase: | |
149 return amount_format | |
150 | |
151 | |
152 def clean(string_type, string): | |
153 characters = ['.', ',', '-', ' '] | |
154 | |
155 clean_string = string | |
156 for character in characters: | |
157 clean_string = clean_string.replace(character, '') | |
158 | |
159 return clean_string | |
160 | |
161 | |
162 def format_amount(string_amount): | |
163 pass | |
67 | 164 |
68 | 165 |
69 if __name__ == '__main__': | 166 if __name__ == '__main__': |
70 main() | 167 main() |
168 | |
169 """ | |
170 Assumptions: | |
171 - Thousand separator is `,` | |
172 - Supported date formats '%d-%m-%Y', '%d_%m_%Y', '%d/%m/%Y', '%d.%m.%Y', '%Y' | |
173 - Accounting values are in the same column and below the date. | |
174 +-------+-------+ | |
175 | 2022 | 2023 | | |
176 +-------+-------+ | |
177 | 3,000 | 3,100 | | |
178 +-------+-------+ | |
179 | 120 | 150 | | |
180 +-------+-------+ | |
181 | 789 | 800 | | |
182 +-------+-------+ | |
183 - Account names must be in column index 1 | |
184 """ |