0
|
1 import json
|
|
2 from datetime import datetime
|
1
|
3 from collections import defaultdict
|
0
|
4
|
|
5
|
|
6 def main():
|
1
|
7 data = defaultdict(dict)
|
|
8 date_index = defaultdict(dict)
|
|
9
|
0
|
10 with open('santander.json') as f:
|
|
11 doc = json.load(f)
|
|
12
|
|
13 if doc['JobStatus'] != 'SUCCEEDED':
|
|
14 print(f"JOB STATUS: {doc['JobStatus']}")
|
|
15
|
|
16 return
|
|
17
|
|
18 blocks = doc['Blocks']
|
1
|
19
|
|
20 # Get format
|
|
21 lines = filter_blocks(blocks, 'BlockType', 'LINE')
|
|
22 for line in lines:
|
|
23 format = get_format(line['Text'])
|
|
24 data['format'] = format
|
|
25 if format:
|
|
26 break
|
|
27
|
|
28 # Find dates value and position
|
|
29 cells = filter_blocks(blocks, 'BlockType', 'CELL')
|
|
30 for cell in cells:
|
|
31 child_ids = extract_child_ids(cell)
|
0
|
32
|
1
|
33 # Get `Text` from `CELL` block
|
|
34 cell_text = ''
|
|
35 for index, child_id in enumerate(child_ids):
|
|
36 word_block = filter_blocks(blocks, 'Id', child_id)[0]
|
|
37 cell_text += word_block['Text']
|
|
38
|
|
39 date_string = is_date(cell_text)
|
|
40 if date_string:
|
|
41 cell_text = date_string
|
|
42 date_index[date_string]['column'] = cell['ColumnIndex']
|
|
43 date_index[date_string]['row'] = cell['RowIndex']
|
|
44
|
0
|
45
|
1
|
46 cell_row_index = cell['RowIndex']
|
|
47 cell_column_index = cell['ColumnIndex']
|
|
48 data['rows'][cell_row_index][cell_column_index] = cell_text
|
|
49
|
|
50 # Delete unused rows
|
|
51 for year in date_index:
|
|
52 for row in data['rows']:
|
|
53 print(row)
|
|
54 exit()
|
|
55 if year[row] < row:
|
|
56 del data[row]
|
|
57
|
|
58 print(data)
|
|
59
|
|
60
|
|
61
|
0
|
62
|
1
|
63
|
|
64 print(data)
|
0
|
65
|
|
66
|
|
67
|
1
|
68 """
|
|
69 # Get table
|
|
70 table = filter_blocks(blocks, 'BlockType', 'TABLE')[0]
|
|
71 table_child_ids = extract_child_ids(table)
|
|
72
|
|
73 # Iterate over childs and get `CELL` blocks
|
|
74 for table_child_id in table_child_ids:
|
|
75 cell = filter_blocks(blocks, 'Id', table_child_id)[0]
|
|
76 cell_child_ids = extract_child_ids(cell)
|
|
77
|
|
78 # Get `Text` from `CELL` block
|
|
79 cell_text = ''
|
|
80 for cell_child_id in cell_child_ids:
|
|
81 word_block = filter_blocks(blocks, 'Id', cell_child_id)[0]
|
|
82 cell_text += word_block['Text']
|
|
83
|
|
84 # Check if cell_text could be a date
|
|
85 date_string = is_date(cell_text)
|
|
86 if date_string:
|
|
87 date_column_index = cell['ColumnIndex']
|
|
88 data[date_column_index] = {'year': date_string}
|
|
89 """
|
|
90
|
|
91
|
|
92 def filter_blocks(blocks, block_key, block_value):
|
|
93 """
|
|
94 Extract a block by key-value from array of blocks
|
|
95 """
|
|
96
|
|
97 return [block for block in blocks if block[block_key] == block_value]
|
|
98
|
|
99
|
0
|
100 def extract_child_ids(block):
|
1
|
101 """
|
|
102 Extract child Ids from a block
|
|
103 """
|
|
104
|
0
|
105 if not 'Relationships' in block:
|
|
106 return []
|
|
107
|
|
108 return [r['Ids'] for r in block['Relationships'] if r['Type'] == 'CHILD'][0]
|
|
109
|
|
110
|
1
|
111 def is_date(string_date):
|
|
112 """
|
|
113 Verify if a string could be a date
|
|
114 """
|
0
|
115
|
1
|
116 formats_allowed = ['%d-%m-%Y', '%d_%m_%Y', '%d/%m/%Y', '%d.%m.%Y', '%Y']
|
0
|
117
|
|
118 for format_allowed in formats_allowed:
|
|
119 try:
|
1
|
120 date = datetime.strptime(string_date, format_allowed)
|
0
|
121
|
1
|
122 return date.strftime("%Y")
|
0
|
123 except ValueError:
|
|
124
|
|
125 # Try removing characters from the beginning and end
|
|
126 options = [string_date[:-1], string_date[1:], string_date[1:-1]]
|
|
127 for option in options:
|
|
128 try:
|
1
|
129 date = datetime.strptime(option, format_allowed)
|
0
|
130
|
1
|
131 return date.strftime("%Y")
|
0
|
132 except ValueError:
|
|
133 continue
|
|
134
|
1
|
135 return
|
|
136
|
|
137
|
|
138 def get_format(phrase):
|
|
139 """
|
|
140 Given a phrase verify if it is specified the amount format
|
|
141 """
|
|
142
|
|
143 amount_formats = ['thousand', 'million', 'billion']
|
|
144
|
|
145 for amount_format in amount_formats:
|
|
146 plural_amount_format = f'{amount_format}s'
|
|
147
|
|
148 if amount_format in phrase or plural_amount_format in phrase:
|
|
149 return amount_format
|
|
150
|
|
151
|
|
152 def clean(string_type, string):
|
|
153 characters = ['.', ',', '-', ' ']
|
|
154
|
|
155 clean_string = string
|
|
156 for character in characters:
|
|
157 clean_string = clean_string.replace(character, '')
|
|
158
|
|
159 return clean_string
|
|
160
|
|
161
|
|
162 def format_amount(string_amount):
|
|
163 pass
|
0
|
164
|
|
165
|
|
166 if __name__ == '__main__':
|
1
|
167 main()
|
|
168
|
|
169 """
|
|
170 Assumptions:
|
|
171 - Thousand separator is `,`
|
|
172 - Supported date formats '%d-%m-%Y', '%d_%m_%Y', '%d/%m/%Y', '%d.%m.%Y', '%Y'
|
|
173 - Accounting values are in the same column and below the date.
|
|
174 +-------+-------+
|
|
175 | 2022 | 2023 |
|
|
176 +-------+-------+
|
|
177 | 3,000 | 3,100 |
|
|
178 +-------+-------+
|
|
179 | 120 | 150 |
|
|
180 +-------+-------+
|
|
181 | 789 | 800 |
|
|
182 +-------+-------+
|
|
183 - Account names must be in column index 1
|
|
184 """ |