Mercurial > public > finance-parser
annotate process_document/app.py @ 13:ab988fd1e6fa
fix minor bugs
author | Dennis C. M. <dennis@denniscm.com> |
---|---|
date | Sat, 10 Jun 2023 14:45:38 +0100 |
parents | bf19235a9636 |
children |
rev | line source |
---|---|
2 | 1 import json |
2 import boto3 | |
3 from datetime import datetime | |
4 from collections import defaultdict | |
5 | |
3
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
2
diff
changeset
|
6 |
2 | 7 s3_client = boto3.client('s3') |
8 | |
9 | |
def lambda_handler(event, context):
    """
    Process an analyzed document stored in S3 and upload the extracted data.

    The document referenced by `event['body']['message']` (`bucketName` and
    `objectKey`) is downloaded to `/tmp/document.json` and parsed as JSON.
    From its `Blocks` list the handler extracts:

    - `format`: the amount format found in the first matching `LINE` block
      (`None` when no line mentions one),
    - `dateRow` / `dateColumns`: row index of the last date cell found and a
      mapping of column index -> year for every cell recognised as a date,
    - `data`: the table rows, keeping only column 1, the date columns and the
      `type` entry, and dropping rows left with a single entry.

    The result is uploaded as JSON to the same bucket, under the original key
    with `analyzed/` replaced by `processed/`.

    NOTE(review): the block layout (`BlockType`, `Relationships`, `RowIndex`,
    `EntityTypes`, ...) looks like Amazon Textract output - confirm against
    the component that produces the document.

    :param event: dict whose `body.message` holds `bucketName`, `objectKey`,
        `companyTicker`, `docType`, `fileId` and `fileName`
    :param context: Lambda context object (unused)
    :return: dict with `statusCode` 200 and a `body.message` echoing the
        incoming metadata with the new `objectKey`
    :raises KeyError: if the event or the document lacks an expected field
    """

    event_msg = event['body']['message']

    # Download file from s3
    s3_client.download_file(
        event_msg['bucketName'],
        event_msg['objectKey'],
        '/tmp/document.json'
    )

    with open('/tmp/document.json') as f:
        doc = json.load(f)

    # Analyze document
    result = defaultdict(dict)
    blocks = doc['Blocks']

    # Index the blocks once so every child lookup is O(1) instead of a scan
    # of the whole list. `setdefault` keeps the first block seen for an id.
    blocks_by_id = {}
    for block in blocks:
        blocks_by_id.setdefault(block['Id'], block)

    # Get format. The key is always written so the output keeps the same
    # shape even when the document has no `LINE` blocks.
    result['format'] = None
    for line in filter_blocks(blocks, 'BlockType', 'LINE'):
        amount_format = get_format(line['Text'])
        if amount_format:
            result['format'] = amount_format
            break

    # Find dates value and position
    data = defaultdict(dict)
    for cell in filter_blocks(blocks, 'BlockType', 'CELL'):
        cell_text = _get_cell_text(cell, blocks_by_id)
        if cell_text is None:
            # Cell without child blocks: nothing to read
            continue

        # Verify if `Text` could be a valid date
        date_string = is_date(cell_text)
        if date_string:
            cell_text = date_string
            result['dateRow'] = cell['RowIndex']
            result['dateColumns'][cell['ColumnIndex']] = date_string

        cell_row_index = cell['RowIndex']
        cell_column_index = cell['ColumnIndex']
        data[cell_row_index][cell_column_index] = clean_text(cell_text)

        if 'EntityTypes' in cell:
            data[cell_row_index]['type'] = cell['EntityTypes']

    # Delete unused row and columns: keep the label column (1), the date
    # columns and the `type` entry; rows left with a single entry are dropped.
    for row_index, row in data.items():
        kept = {
            column_index: value
            for column_index, value in row.items()
            if column_index in result['dateColumns']
            or column_index == 1
            or column_index == 'type'
        }

        if len(kept) > 1:
            result['data'][row_index] = kept

    object_key = event_msg['objectKey'].replace('analyzed/', 'processed/')
    data_string = json.dumps(result, indent=2, default=str)

    s3_client.put_object(
        Bucket=event_msg['bucketName'],
        Key=object_key,
        Body=data_string
    )

    return {
        "statusCode": 200,
        "body": {
            "message": {
                "companyTicker": event_msg['companyTicker'],
                "docType": event_msg['docType'],
                "fileId": event_msg['fileId'],
                "fileName": event_msg['fileName'],
                "objectKey": object_key,
                "bucketName": event_msg['bucketName']
            }
        },
    }


def _get_cell_text(cell, blocks_by_id):
    """
    Join the `Text` of the child blocks of a `CELL` block with `_`.

    Children that are unknown or carry no `Text` field are skipped
    (presumably non-word children such as selection elements - verify).

    :param cell: `CELL` block
    :param blocks_by_id: mapping of block id -> block
    :return: the joined text, or `None` when the cell has no `CHILD`
        relationship
    """

    child_ids = next(
        (
            relationship['Ids']
            for relationship in cell.get('Relationships', [])
            if relationship['Type'] == 'CHILD'
        ),
        None
    )

    if child_ids is None:
        return None

    texts = []
    for child_id in child_ids:
        child_block = blocks_by_id.get(child_id)
        if child_block is not None and 'Text' in child_block:
            texts.append(child_block['Text'])

    return '_'.join(texts)
102 | |
103 | |
def filter_blocks(blocks, block_key, block_value):
    """
    Return every block whose `block_key` entry equals `block_value`.

    Blocks that do not have `block_key` at all never match, so a
    heterogeneous list of blocks can be filtered without raising `KeyError`.

    :param blocks: iterable of dict-like blocks
    :param block_key: key to inspect in each block
    :param block_value: value the key must be equal to
    :return: list of matching blocks, in their original order (empty list
        when nothing matches)
    """

    return [
        block for block in blocks
        if block_key in block and block[block_key] == block_value
    ]
110 | |
111 | |
def is_date(string_date):
    """
    Verify if a string could be a date and return its year.

    The string is tried against each allowed format (`%d-%m-%Y`, `%d/%m/%Y`,
    `%Y`). For every format the full string is tried first, then the string
    without its last character, without its first character, and without
    both, so that values wrapped in stray characters such as `(2020)` are
    still recognised.

    :param string_date: text to check
    :return: the year as a 4-digit string, or `None` when the text is not a
        string, cannot be parsed, or the first successful parse yields a year
        before 1900 or after the current year
    """

    if not isinstance(string_date, str):
        return None

    formats_allowed = ['%d-%m-%Y', '%d/%m/%Y', '%Y']

    # Full string first, then try removing characters from the beginning
    # and end
    candidates = [
        string_date,
        string_date[:-1],
        string_date[1:],
        string_date[1:-1]
    ]

    current_year = datetime.now().year

    for format_allowed in formats_allowed:
        for candidate in candidates:
            try:
                date = datetime.strptime(candidate, format_allowed)
            except ValueError:
                continue

            # The first candidate that parses decides the outcome
            if date.year > current_year or date.year < 1900:
                return None  # Date out of range

            return date.strftime("%Y")

    return None
143 | |
144 | |
def get_format(phrase):
    """
    Given a phrase verify if it is specified the amount format

    The search is a case-insensitive substring match, so plural forms
    (`thousands`, `millions`, `billions`) match as well. Formats are checked
    in the order thousand, million, billion and the first hit wins.

    :param phrase: text to inspect
    :return: `'thousand'`, `'million'` or `'billion'`, or `None` when the
        phrase mentions none of them
    """

    amount_formats = ['thousand', 'million', 'billion']
    lowered_phrase = phrase.lower()

    for amount_format in amount_formats:
        if amount_format in lowered_phrase:
            return amount_format

    return None
157 | |
158 | |
def clean_text(text, text_type='default'):
    """
    Remove bad characters from word

    Every special character is deleted except the ones allowed for the given
    `text_type`, and the result is lower-cased.

    :param text: text to clean
    :param text_type: `'date'` keeps `_`, `-` and `/`; any other value keeps
        only `_`
    :return: the cleaned, lower-cased text
    """

    special_chars = '!@#$%^&*()-_+=[]{}\\|;:"\'<>/?.,'

    if text_type == 'date':
        allowed_chars = ['_', '-', '/']
    else:
        allowed_chars = ['_']

    # Single pass over the text: map every disallowed character to None
    removal_table = {
        ord(char): None for char in special_chars if char not in allowed_chars
    }

    return text.translate(removal_table).lower()