comparison process_document/app.py @ 3:2e5f3664f3e4

documents analyzer almost finished
author Dennis C. M. <dennis@denniscm.com>
date Fri, 02 Jun 2023 20:12:29 +0100
parents ef8a4d95755a
children 9005b7590008
comparison
equal deleted inserted replaced
2:ef8a4d95755a 3:2e5f3664f3e4
1 import json 1 import json
2 import boto3 2 import boto3
3 from datetime import datetime 3 from datetime import datetime
4 from collections import defaultdict 4 from collections import defaultdict
5 5
6
6 s3_client = boto3.client('s3') 7 s3_client = boto3.client('s3')
7 textract_client = boto3.client('textract')
8 8
9 9
10 def lambda_handler(event, context): 10 def lambda_handler(event, context):
11 for record in event['Records']: 11 event_message = event['body']['message']
12 metadata = record['s3'] 12 object_key = event_message['objectKey']
13 bucket_name = metadata['bucket']['name'] 13 bucket_name = event_message['bucketName']
14 object_key = metadata['object']['key']
15 14
16 doc = textract_client.analyze_document( 15 # Download file from s3
17 Document={'S3Object': {'Bucket': bucket_name, 'Name': object_key}}, 16 s3_client.download_file(bucket_name, object_key, '/tmp/document.json')
18 FeatureTypes=['TABLES']
19 )
20 17
21 # Analyze document 18 with open('/tmp/document.json') as f:
22 result = defaultdict(dict) 19 doc = json.load(f)
23 blocks = doc['Blocks']
24 20
25 # Get format 21 # Analyze document
26 lines = filter_blocks(blocks, 'BlockType', 'LINE') 22 result = defaultdict(dict)
27 for line in lines: 23 blocks = doc['Blocks']
28 amount_format = get_format(line['Text'])
29 result['format'] = amount_format
30 if amount_format:
31 break
32 24
33 # Find dates value and position 25 # Get format
34 data = defaultdict(dict) 26 lines = filter_blocks(blocks, 'BlockType', 'LINE')
35 cells = filter_blocks(blocks, 'BlockType', 'CELL') 27 for line in lines:
36 for cell in cells: 28 amount_format = get_format(line['Text'])
37 if not 'Relationships' in cell: 29 result['format'] = amount_format
38 continue 30 if amount_format:
31 break
39 32
40 child_ids = [r['Ids'] for r in cell['Relationships'] if r['Type'] == 'CHILD'][0] 33 # Find dates value and position
34 data = defaultdict(dict)
35 cells = filter_blocks(blocks, 'BlockType', 'CELL')
36 for cell in cells:
37 if not 'Relationships' in cell:
38 continue
41 39
42 # Get `Text` from `CELL` block 40 child_ids = [r['Ids'] for r in cell['Relationships'] if r['Type'] == 'CHILD'][0]
43 cell_text = ''
44 for index, child_id in enumerate(child_ids):
45 word_block = filter_blocks(blocks, 'Id', child_id)[0]
46 cell_text += word_block['Text']
47 41
48 if index < len(child_ids) - 1: 42 # Get `Text` from `CELL` block
49 cell_text += '_' 43 cell_text = ''
44 for index, child_id in enumerate(child_ids):
45 word_block = filter_blocks(blocks, 'Id', child_id)[0]
46 cell_text += word_block['Text']
50 47
51 # Verify if `Text` could be a valid date 48 if index < len(child_ids) - 1:
52 date_string = is_date(cell_text) 49 cell_text += '_'
53 if date_string:
54 cell_text = date_string
55 result['dateRow'] = cell['RowIndex']
56 result['dateColumns'][cell['ColumnIndex']] = date_string
57 50
58 cell_row_index = cell['RowIndex'] 51 # Verify if `Text` could be a valid date
59 cell_column_index = cell['ColumnIndex'] 52 date_string = is_date(cell_text)
60 data[cell_row_index][cell_column_index] = clean(cell_text) 53 if date_string:
54 cell_text = date_string
55 result['dateRow'] = cell['RowIndex']
56 result['dateColumns'][cell['ColumnIndex']] = date_string
61 57
62 # Delete unused row and columns 58 cell_row_index = cell['RowIndex']
63 for row_index in list(data.keys()): 59 cell_column_index = cell['ColumnIndex']
64 if row_index > result['dateRow']: 60 data[cell_row_index][cell_column_index] = clean(cell_text)
65 row = data[row_index]
66 for column_index in list(row.keys()):
67 if column_index not in result['dateColumns'] and column_index != 1:
68 del row[column_index]
69 61
70 if len(row) > 1: 62 try:
71 result['data'][row_index] = row 63 data[cell_row_index]['type'] = cell['EntityTypes']
64 except KeyError:
65 pass
72 66
73 print(f'RESULT: {result}') 67 # Delete unused row and columns
68 for row_index in list(data.keys()):
69 row = data[row_index]
70 for column_index in list(row.keys()):
71 if column_index not in result['dateColumns'] \
72 and column_index != 1 and column_index != 'type':
73 del row[column_index]
74
75 if len(row) > 1:
76 result['data'][row_index] = row
77
78 filename = object_key.replace('analyzed/', 'processed/')
79 data_string = json.dumps(result, indent=2, default=str)
80
81 s3_client.put_object(
82 Bucket=bucket_name,
83 Key=filename,
84 Body=data_string
85 )
74 86
75 return { 87 return {
76 "statusCode": 200, 88 "statusCode": 200,
77 "body": json.dumps({ 89 "body": {
78 "message": "ok" 90 "message": {
79 }), 91 "objectKey": filename,
92 "bucketName": bucket_name
93 }
94 },
80 } 95 }
81 96
82 97
83 def filter_blocks(blocks, block_key, block_value): 98 def filter_blocks(blocks, block_key, block_value):
84 """ 99 """