Mercurial > public > finance-parser
annotate process_document/app.py @ 3:2e5f3664f3e4
documents analyzer almost finished
author | Dennis C. M. <dennis@denniscm.com> |
---|---|
date | Fri, 02 Jun 2023 20:12:29 +0100 |
parents | ef8a4d95755a |
children | 9005b7590008 |
rev | line source |
---|---|
2 | 1 import json |
2 import boto3 | |
3 from datetime import datetime | |
4 from collections import defaultdict | |
5 | |
3
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
2
diff
changeset
|
6 |
2 | 7 s3_client = boto3.client('s3') |
8 | |
9 | |
def lambda_handler(event, context):
    """
    Turn a Textract analysis stored in S3 into a compact JSON summary.

    Reads `objectKey` and `bucketName` from `event['body']['message']`,
    downloads that object and builds a result that may hold:

    - `format`: value returned by `get_format` for the first LINE block
      that mentions an amount format (None when no line does)
    - `dateRow` / `dateColumns`: row index and column index -> year of
      the cells accepted by `is_date`
    - `data`: table rows reduced to column 1, the date columns and the
      row `type`; rows left with a single entry are dropped

    The result is uploaded to the same bucket with `analyzed/` replaced by
    `processed/` in the key. The returned dict mirrors the incoming event
    shape and points at the uploaded object.
    """

    event_message = event['body']['message']
    object_key = event_message['objectKey']
    bucket_name = event_message['bucketName']

    # Download file from s3
    s3_client.download_file(bucket_name, object_key, '/tmp/document.json')

    with open('/tmp/document.json', encoding='utf-8') as f:
        doc = json.load(f)

    # Analyze document
    result = defaultdict(dict)
    blocks = doc['Blocks']

    # Index blocks once so resolving a child id does not rescan every block.
    # `setdefault` keeps the first block seen for a given id.
    blocks_by_id = {}
    for block in blocks:
        if 'Id' in block:
            blocks_by_id.setdefault(block['Id'], block)

    # Get format
    for line in filter_blocks(blocks, 'BlockType', 'LINE'):
        amount_format = get_format(line['Text'])
        result['format'] = amount_format
        if amount_format:
            break

    # Find dates value and position
    data = defaultdict(dict)
    for cell in filter_blocks(blocks, 'BlockType', 'CELL'):
        cell_text = _get_cell_text(cell, blocks_by_id)
        if cell_text is None:
            # Nothing to read: the cell has no CHILD relationship
            continue

        cell_row_index = cell['RowIndex']
        cell_column_index = cell['ColumnIndex']

        # Verify if `Text` could be a valid date
        date_string = is_date(cell_text)
        if date_string:
            cell_text = date_string
            result['dateRow'] = cell_row_index
            result['dateColumns'][cell_column_index] = date_string

        data[cell_row_index][cell_column_index] = clean(cell_text)

        if 'EntityTypes' in cell:
            data[cell_row_index]['type'] = cell['EntityTypes']

    # Delete unused row and columns
    for row_index in list(data):
        row = data[row_index]
        for column_index in list(row):
            # Keep column 1, the date columns and the row type
            if column_index == 1 or column_index == 'type':
                continue
            if column_index not in result['dateColumns']:
                del row[column_index]

        # A row with a single remaining entry carries no usable value
        if len(row) > 1:
            result['data'][row_index] = row

    filename = object_key.replace('analyzed/', 'processed/')
    data_string = json.dumps(result, indent=2, default=str)

    s3_client.put_object(
        Bucket=bucket_name,
        Key=filename,
        Body=data_string
    )

    return {
        "statusCode": 200,
        "body": {
            "message": {
                "objectKey": filename,
                "bucketName": bucket_name
            }
        },
    }


def _get_cell_text(cell, blocks_by_id):
    """
    Join the `Text` of a CELL block's children with underscores.

    Returns None when the cell has no CHILD relationship. Children that are
    missing from `blocks_by_id` or carry no `Text` key (for example
    selection elements) are left out instead of raising.
    """

    child_ids = None
    for relationship in cell.get('Relationships') or []:
        if relationship['Type'] == 'CHILD':
            child_ids = relationship['Ids']
            break

    if child_ids is None:
        return None

    words = []
    for child_id in child_ids:
        child = blocks_by_id.get(child_id)
        if child is not None and 'Text' in child:
            words.append(child['Text'])

    return '_'.join(words)
96 | |
97 | |
def filter_blocks(blocks, block_key, block_value):
    """
    Collect the blocks whose `block_key` entry equals `block_value`

    Returns a list in the original order; it is empty when nothing matches.
    """

    matches = []
    for candidate in blocks:
        if candidate[block_key] == block_value:
            matches.append(candidate)

    return matches
104 | |
105 | |
def is_date(string_date):
    """
    Verify if a string could be a date.

    For each allowed format the text is tried as-is, then without its last
    character, without its first character, and without both. The first
    candidate that parses decides the outcome: its year is returned as a
    string, or None when that year is before 1900 or after the current
    year. None is also returned when nothing parses.
    """

    patterns = ('%d-%m-%Y', '%d_%m_%Y', '%d/%m/%Y', '%d.%m.%Y', '%Y')
    candidates = (
        string_date,
        string_date[:-1],
        string_date[1:],
        string_date[1:-1],
    )

    for pattern in patterns:
        for candidate in candidates:
            try:
                parsed = datetime.strptime(candidate, pattern)
            except ValueError:
                continue

            # Date out of range: stop here, remaining candidates are not tried
            if not 1900 <= parsed.year <= datetime.now().year:
                return None

            return parsed.strftime("%Y")

    return None
139 | |
140 | |
def get_format(phrase):
    """
    Given a phrase, return the amount format it mentions, if any.

    The match is a case-insensitive substring search, so plural and
    capitalized forms ('thousands', 'Millions', 'BILLION') are recognized.
    Formats are checked in the order thousand, million, billion and the
    first hit wins. Returns None when no format is mentioned.
    """

    amount_formats = ['thousand', 'million', 'billion']
    lowered_phrase = phrase.lower()

    for amount_format in amount_formats:
        # The plural contains the singular, so one substring test covers both
        if amount_format in lowered_phrase:
            return amount_format

    return None
153 | |
154 | |
def clean(text):
    """
    Remove bad characters from word

    Every '.', ',', '-' and space is dropped, then the remainder is
    lowercased.
    """

    removal_table = str.maketrans('', '', '.,- ')
    return text.translate(removal_table).lower()