finance-parser: comparison of process_document/app.py @ 3:2e5f3664f3e4
documents analyzer almost finished
| author   | Dennis C. M. <dennis@denniscm.com> |
| -------- | ---------------------------------- |
| date     | Fri, 02 Jun 2023 20:12:29 +0100    |
| parents  | ef8a4d95755a                       |
| children | 9005b7590008                       |
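
Before the comparison itself, a minimal sketch of how the revised handler might be invoked. The `body.message.objectKey` / `bucketName` field names and the response shape come from the new revision below; the module path, bucket, and key values are hypothetical placeholders.

```python
# Hypothetical local invocation of the revised lambda_handler.
# Assumes the module path process_document.app and placeholder S3 names.
from process_document.app import lambda_handler

sample_event = {
    "body": {
        "message": {
            # Textract output JSON written by an earlier pipeline step
            "objectKey": "analyzed/annual-report-2022.json",
            "bucketName": "finance-parser-documents",
        }
    }
}

response = lambda_handler(sample_event, context=None)
# The handler writes the processed result under processed/... and returns
# the new object key and bucket in the same body.message shape.
print(response["body"]["message"]["objectKey"])
```

With this revision the input location comes from the invocation payload rather than from S3 event records, and the response mirrors the same `body.message` shape, so the output of one step can be passed straight to the next.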
--- process_document/app.py  2:ef8a4d95755a
+++ process_document/app.py  3:2e5f3664f3e4
 import json
 import boto3
 from datetime import datetime
 from collections import defaultdict
 
+
 s3_client = boto3.client('s3')
-textract_client = boto3.client('textract')
 
 
 def lambda_handler(event, context):
-    for record in event['Records']:
-        metadata = record['s3']
-        bucket_name = metadata['bucket']['name']
-        object_key = metadata['object']['key']
+    event_message = event['body']['message']
+    object_key = event_message['objectKey']
+    bucket_name = event_message['bucketName']
 
-        doc = textract_client.analyze_document(
-            Document={'S3Object': {'Bucket': bucket_name, 'Name': object_key}},
-            FeatureTypes=['TABLES']
-        )
+    # Download file from s3
+    s3_client.download_file(bucket_name, object_key, '/tmp/document.json')
 
-        # Analyze document
-        result = defaultdict(dict)
-        blocks = doc['Blocks']
+    with open('/tmp/document.json') as f:
+        doc = json.load(f)
 
-        # Get format
-        lines = filter_blocks(blocks, 'BlockType', 'LINE')
-        for line in lines:
-            amount_format = get_format(line['Text'])
-            result['format'] = amount_format
-            if amount_format:
-                break
+    # Analyze document
+    result = defaultdict(dict)
+    blocks = doc['Blocks']
 
-        # Find dates value and position
-        data = defaultdict(dict)
-        cells = filter_blocks(blocks, 'BlockType', 'CELL')
-        for cell in cells:
-            if not 'Relationships' in cell:
-                continue
+    # Get format
+    lines = filter_blocks(blocks, 'BlockType', 'LINE')
+    for line in lines:
+        amount_format = get_format(line['Text'])
+        result['format'] = amount_format
+        if amount_format:
+            break
 
-            child_ids = [r['Ids'] for r in cell['Relationships'] if r['Type'] == 'CHILD'][0]
+    # Find dates value and position
+    data = defaultdict(dict)
+    cells = filter_blocks(blocks, 'BlockType', 'CELL')
+    for cell in cells:
+        if not 'Relationships' in cell:
+            continue
 
-            # Get `Text` from `CELL` block
-            cell_text = ''
-            for index, child_id in enumerate(child_ids):
-                word_block = filter_blocks(blocks, 'Id', child_id)[0]
-                cell_text += word_block['Text']
+        child_ids = [r['Ids'] for r in cell['Relationships'] if r['Type'] == 'CHILD'][0]
 
-                if index < len(child_ids) - 1:
-                    cell_text += '_'
+        # Get `Text` from `CELL` block
+        cell_text = ''
+        for index, child_id in enumerate(child_ids):
+            word_block = filter_blocks(blocks, 'Id', child_id)[0]
+            cell_text += word_block['Text']
 
-            # Verify if `Text` could be a valid date
-            date_string = is_date(cell_text)
-            if date_string:
-                cell_text = date_string
-                result['dateRow'] = cell['RowIndex']
-                result['dateColumns'][cell['ColumnIndex']] = date_string
+            if index < len(child_ids) - 1:
+                cell_text += '_'
 
-            cell_row_index = cell['RowIndex']
-            cell_column_index = cell['ColumnIndex']
-            data[cell_row_index][cell_column_index] = clean(cell_text)
+        # Verify if `Text` could be a valid date
+        date_string = is_date(cell_text)
+        if date_string:
+            cell_text = date_string
+            result['dateRow'] = cell['RowIndex']
+            result['dateColumns'][cell['ColumnIndex']] = date_string
 
-        # Delete unused row and columns
-        for row_index in list(data.keys()):
-            if row_index > result['dateRow']:
-                row = data[row_index]
-                for column_index in list(row.keys()):
-                    if column_index not in result['dateColumns'] and column_index != 1:
-                        del row[column_index]
+        cell_row_index = cell['RowIndex']
+        cell_column_index = cell['ColumnIndex']
+        data[cell_row_index][cell_column_index] = clean(cell_text)
 
-                if len(row) > 1:
-                    result['data'][row_index] = row
+        try:
+            data[cell_row_index]['type'] = cell['EntityTypes']
+        except KeyError:
+            pass
 
-        print(f'RESULT: {result}')
+    # Delete unused row and columns
+    for row_index in list(data.keys()):
+        row = data[row_index]
+        for column_index in list(row.keys()):
+            if column_index not in result['dateColumns'] \
+                    and column_index != 1 and column_index != 'type':
+                del row[column_index]
+
+        if len(row) > 1:
+            result['data'][row_index] = row
+
+    filename = object_key.replace('analyzed/', 'processed/')
+    data_string = json.dumps(result, indent=2, default=str)
+
+    s3_client.put_object(
+        Bucket=bucket_name,
+        Key=filename,
+        Body=data_string
+    )
 
     return {
         "statusCode": 200,
-        "body": json.dumps({
-            "message": "ok"
-        }),
+        "body": {
+            "message": {
+                "objectKey": filename,
+                "bucketName": bucket_name
+            }
+        },
     }
 
 
 def filter_blocks(blocks, block_key, block_value):
     """