Mercurial > public > finance-parser
diff process_document/app.py @ 3:2e5f3664f3e4
documents analyzer almost finished
author | Dennis C. M. <dennis@denniscm.com> |
---|---|
date | Fri, 02 Jun 2023 20:12:29 +0100 |
parents | ef8a4d95755a |
children | 9005b7590008 |
line wrap: on
line diff
```diff
--- a/process_document/app.py	Thu Jun 01 18:51:18 2023 +0100
+++ b/process_document/app.py	Fri Jun 02 20:12:29 2023 +0100
@@ -3,80 +3,95 @@
 from datetime import datetime
 from collections import defaultdict
 
+
 s3_client = boto3.client('s3')
-textract_client = boto3.client('textract')
 
 
 def lambda_handler(event, context):
-    for record in event['Records']:
-        metadata = record['s3']
-        bucket_name = metadata['bucket']['name']
-        object_key = metadata['object']['key']
+    event_message = event['body']['message']
+    object_key = event_message['objectKey']
+    bucket_name = event_message['bucketName']
+
+    # Download file from s3
+    s3_client.download_file(bucket_name, object_key, '/tmp/document.json')
 
-        doc = textract_client.analyze_document(
-            Document={'S3Object': {'Bucket': bucket_name, 'Name': object_key}},
-            FeatureTypes=['TABLES']
-        )
+    with open('/tmp/document.json') as f:
+        doc = json.load(f)
 
-        # Analyze document
-        result = defaultdict(dict)
-        blocks = doc['Blocks']
+    # Analyze document
+    result = defaultdict(dict)
+    blocks = doc['Blocks']
 
-        # Get format
-        lines = filter_blocks(blocks, 'BlockType', 'LINE')
-        for line in lines:
-            amount_format = get_format(line['Text'])
-            result['format'] = amount_format
-            if amount_format:
-                break
+    # Get format
+    lines = filter_blocks(blocks, 'BlockType', 'LINE')
+    for line in lines:
+        amount_format = get_format(line['Text'])
+        result['format'] = amount_format
+        if amount_format:
+            break
 
-        # Find dates value and position
-        data = defaultdict(dict)
-        cells = filter_blocks(blocks, 'BlockType', 'CELL')
-        for cell in cells:
-            if not 'Relationships' in cell:
-                continue
+    # Find dates value and position
+    data = defaultdict(dict)
+    cells = filter_blocks(blocks, 'BlockType', 'CELL')
+    for cell in cells:
+        if not 'Relationships' in cell:
+            continue
 
-            child_ids = [r['Ids'] for r in cell['Relationships'] if r['Type'] == 'CHILD'][0]
+        child_ids = [r['Ids'] for r in cell['Relationships'] if r['Type'] == 'CHILD'][0]
+
+        # Get `Text` from `CELL` block
+        cell_text = ''
+        for index, child_id in enumerate(child_ids):
+            word_block = filter_blocks(blocks, 'Id', child_id)[0]
+            cell_text += word_block['Text']
 
-            # Get `Text` from `CELL` block
-            cell_text = ''
-            for index, child_id in enumerate(child_ids):
-                word_block = filter_blocks(blocks, 'Id', child_id)[0]
-                cell_text += word_block['Text']
+            if index < len(child_ids) - 1:
+                cell_text += '_'
 
-                if index < len(child_ids) - 1:
-                    cell_text += '_'
+        # Verify if `Text` could be a valid date
+        date_string = is_date(cell_text)
+        if date_string:
+            cell_text = date_string
+            result['dateRow'] = cell['RowIndex']
+            result['dateColumns'][cell['ColumnIndex']] = date_string
 
-            # Verify if `Text` could be a valid date
-            date_string = is_date(cell_text)
-            if date_string:
-                cell_text = date_string
-                result['dateRow'] = cell['RowIndex']
-                result['dateColumns'][cell['ColumnIndex']] = date_string
+        cell_row_index = cell['RowIndex']
+        cell_column_index = cell['ColumnIndex']
+        data[cell_row_index][cell_column_index] = clean(cell_text)
+
+        try:
+            data[cell_row_index]['type'] = cell['EntityTypes']
+        except KeyError:
+            pass
 
-            cell_row_index = cell['RowIndex']
-            cell_column_index = cell['ColumnIndex']
-            data[cell_row_index][cell_column_index] = clean(cell_text)
+    # Delete unused row and columns
+    for row_index in list(data.keys()):
+        row = data[row_index]
+        for column_index in list(row.keys()):
+            if column_index not in result['dateColumns'] \
+                    and column_index != 1 and column_index != 'type':
+                del row[column_index]
 
-        # Delete unused row and columns
-        for row_index in list(data.keys()):
-            if row_index > result['dateRow']:
-                row = data[row_index]
-                for column_index in list(row.keys()):
-                    if column_index not in result['dateColumns'] and column_index != 1:
-                        del row[column_index]
+        if len(row) > 1:
+            result['data'][row_index] = row
+
+    filename = object_key.replace('analyzed/', 'processed/')
+    data_string = json.dumps(result, indent=2, default=str)
 
-                if len(row) > 1:
-                    result['data'][row_index] = row
-
-        print(f'RESULT: {result}')
+    s3_client.put_object(
+        Bucket=bucket_name,
+        Key=filename,
+        Body=data_string
+    )
 
     return {
         "statusCode": 200,
-        "body": json.dumps({
-            "message": "ok"
-        }),
+        "body": {
+            "message": {
+                "objectKey": filename,
+                "bucketName": bucket_name
+            }
+        },
     }
 
 
```