finance-parser: comparison of process_document/app.py @ 3:2e5f3664f3e4
documents analyzer almost finished
| author   | Dennis C. M. <dennis@denniscm.com> |
| -------- | ---------------------------------- |
| date     | Fri, 02 Jun 2023 20:12:29 +0100    |
| parents  | ef8a4d95755a                       |
| children | 9005b7590008                       |
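
Before the comparison itself, a minimal sketch of how the revised handler might be invoked. The `body.message.objectKey` / `bucketName` field names and the response shape come from the new revision below; the module path, bucket, and key values are hypothetical placeholders.

```python
# Hypothetical local invocation of the revised lambda_handler.
# Assumes the module path process_document.app and placeholder S3 names.
from process_document.app import lambda_handler

sample_event = {
    "body": {
        "message": {
            # Textract output JSON written by an earlier pipeline step
            "objectKey": "analyzed/annual-report-2022.json",
            "bucketName": "finance-parser-documents",
        }
    }
}

response = lambda_handler(sample_event, context=None)
# The handler writes the processed result under processed/... and returns
# the new object key and bucket in the same body.message shape.
print(response["body"]["message"]["objectKey"])
```

With this revision the input location comes from the invocation payload rather than from S3 event records, and the response mirrors the same `body.message` shape, so the output of one step can be passed straight to the next.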
--- process_document/app.py  2:ef8a4d95755a
+++ process_document/app.py  3:2e5f3664f3e4
 import json
 import boto3
 from datetime import datetime
 from collections import defaultdict
 
+
 s3_client = boto3.client('s3')
-textract_client = boto3.client('textract')
 
 
 def lambda_handler(event, context):
-    for record in event['Records']:
-        metadata = record['s3']
-        bucket_name = metadata['bucket']['name']
-        object_key = metadata['object']['key']
+    event_message = event['body']['message']
+    object_key = event_message['objectKey']
+    bucket_name = event_message['bucketName']
 
-        doc = textract_client.analyze_document(
-            Document={'S3Object': {'Bucket': bucket_name, 'Name': object_key}},
-            FeatureTypes=['TABLES']
-        )
+    # Download file from s3
+    s3_client.download_file(bucket_name, object_key, '/tmp/document.json')
 
-        # Analyze document
-        result = defaultdict(dict)
-        blocks = doc['Blocks']
+    with open('/tmp/document.json') as f:
+        doc = json.load(f)
 
-        # Get format
-        lines = filter_blocks(blocks, 'BlockType', 'LINE')
-        for line in lines:
-            amount_format = get_format(line['Text'])
-            result['format'] = amount_format
-            if amount_format:
-                break
+    # Analyze document
+    result = defaultdict(dict)
+    blocks = doc['Blocks']
 
-        # Find dates value and position
-        data = defaultdict(dict)
-        cells = filter_blocks(blocks, 'BlockType', 'CELL')
-        for cell in cells:
-            if not 'Relationships' in cell:
-                continue
+    # Get format
+    lines = filter_blocks(blocks, 'BlockType', 'LINE')
+    for line in lines:
+        amount_format = get_format(line['Text'])
+        result['format'] = amount_format
+        if amount_format:
+            break
 
-            child_ids = [r['Ids'] for r in cell['Relationships'] if r['Type'] == 'CHILD'][0]
+    # Find dates value and position
+    data = defaultdict(dict)
+    cells = filter_blocks(blocks, 'BlockType', 'CELL')
+    for cell in cells:
+        if not 'Relationships' in cell:
+            continue
 
-            # Get `Text` from `CELL` block
-            cell_text = ''
-            for index, child_id in enumerate(child_ids):
-                word_block = filter_blocks(blocks, 'Id', child_id)[0]
-                cell_text += word_block['Text']
+        child_ids = [r['Ids'] for r in cell['Relationships'] if r['Type'] == 'CHILD'][0]
 
-                if index < len(child_ids) - 1:
-                    cell_text += '_'
+        # Get `Text` from `CELL` block
+        cell_text = ''
+        for index, child_id in enumerate(child_ids):
+            word_block = filter_blocks(blocks, 'Id', child_id)[0]
+            cell_text += word_block['Text']
 
-            # Verify if `Text` could be a valid date
-            date_string = is_date(cell_text)
-            if date_string:
-                cell_text = date_string
-                result['dateRow'] = cell['RowIndex']
-                result['dateColumns'][cell['ColumnIndex']] = date_string
+            if index < len(child_ids) - 1:
+                cell_text += '_'
 
-            cell_row_index = cell['RowIndex']
-            cell_column_index = cell['ColumnIndex']
-            data[cell_row_index][cell_column_index] = clean(cell_text)
+        # Verify if `Text` could be a valid date
+        date_string = is_date(cell_text)
+        if date_string:
+            cell_text = date_string
+            result['dateRow'] = cell['RowIndex']
+            result['dateColumns'][cell['ColumnIndex']] = date_string
 
-        # Delete unused row and columns
-        for row_index in list(data.keys()):
-            if row_index > result['dateRow']:
-                row = data[row_index]
-                for column_index in list(row.keys()):
-                    if column_index not in result['dateColumns'] and column_index != 1:
-                        del row[column_index]
+        cell_row_index = cell['RowIndex']
+        cell_column_index = cell['ColumnIndex']
+        data[cell_row_index][cell_column_index] = clean(cell_text)
 
-                if len(row) > 1:
-                    result['data'][row_index] = row
+        try:
+            data[cell_row_index]['type'] = cell['EntityTypes']
+        except KeyError:
+            pass
 
-        print(f'RESULT: {result}')
+    # Delete unused row and columns
+    for row_index in list(data.keys()):
+        row = data[row_index]
+        for column_index in list(row.keys()):
+            if column_index not in result['dateColumns'] \
+                    and column_index != 1 and column_index != 'type':
+                del row[column_index]
+
+        if len(row) > 1:
+            result['data'][row_index] = row
+
+    filename = object_key.replace('analyzed/', 'processed/')
+    data_string = json.dumps(result, indent=2, default=str)
+
+    s3_client.put_object(
+        Bucket=bucket_name,
+        Key=filename,
+        Body=data_string
+    )
 
     return {
         "statusCode": 200,
-        "body": json.dumps({
-            "message": "ok"
-        }),
+        "body": {
+            "message": {
+                "objectKey": filename,
+                "bucketName": bucket_name
+            }
+        },
     }
 
 
 def filter_blocks(blocks, block_key, block_value):
     """