Mercurial > public > finance-parser
diff analyze_document/app.py @ 3:2e5f3664f3e4
documents analyzer almost finished
author | Dennis C. M. <dennis@denniscm.com> |
---|---|
date | Fri, 02 Jun 2023 20:12:29 +0100 |
parents | |
children | 9005b7590008 |
line wrap: on
line diff
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/analyze_document/app.py Fri Jun 02 20:12:29 2023 +0100
# @@ -0,0 +1,44 @@
"""Lambda handler that runs Textract table analysis on a PDF stored in S3.

The handler reads the bucket name and object key from the incoming event,
analyzes the object with Textract, stores the raw response as JSON under
``analyzed/`` in the same bucket, and deletes the original object.
"""
import json
import boto3
import uuid
import re


# Clients are created once at import time so they are shared by every
# invocation that runs in the same process.
textract_client = boto3.client('textract')
s3_client = boto3.client('s3')

# Captures whatever sits between 'unprocessed/' and '.pdf' in an object key.
# The dot is escaped so that only a literal '.pdf' is accepted.
_TICKER_PATTERN = re.compile(r'unprocessed/(.*)\.pdf')


def _extract_company_ticker(object_key):
    """Return the part of *object_key* between 'unprocessed/' and '.pdf'.

    Raises:
        ValueError: if *object_key* does not contain
            ``unprocessed/<ticker>.pdf``.
    """
    match = _TICKER_PATTERN.search(object_key)
    if match is None:
        raise ValueError(
            f"Object key {object_key!r} does not match "
            f"'unprocessed/<ticker>.pdf'"
        )
    return match.group(1)


def lambda_handler(event, context):
    """Analyze the PDF referenced by *event* and store the result in S3.

    Args:
        event: Mapping that must provide ``detail.bucket.name`` and
            ``detail.object.key``.
            NOTE(review): this looks like an EventBridge S3 object event;
            confirm against the trigger configuration.
        context: Lambda context object; unused.

    Returns:
        A dict with ``statusCode`` 200 and a ``body.message`` holding the
        key of the written JSON object (``objectKey``) and the bucket it
        lives in (``bucketName``).

    Raises:
        KeyError: if the expected fields are missing from *event*.
        ValueError: if the object key is not of the form
            ``unprocessed/<ticker>.pdf``.
    """
    event_detail = event['detail']
    bucket_name = event_detail['bucket']['name']
    object_key = event_detail['object']['key']

    # Validate the key before any AWS call so a malformed key fails fast
    # with a clear message and nothing is analyzed, written or deleted.
    company_ticker = _extract_company_ticker(object_key)

    # NOTE(review): this is the synchronous Textract API; confirm its
    # document limits are acceptable for the PDFs uploaded here.
    data_dict = textract_client.analyze_document(
        Document={'S3Object': {'Bucket': bucket_name, 'Name': object_key}},
        FeatureTypes=['TABLES']
    )

    # default=str stringifies any value json cannot serialize natively.
    data_string = json.dumps(data_dict, indent=2, default=str)

    # A random UUID suffix keeps repeated uploads for the same ticker from
    # overwriting each other.
    filename = f'{company_ticker}_{uuid.uuid4()}.json'
    analyzed_key = f'analyzed/{filename}'

    s3_client.put_object(
        Bucket=bucket_name,
        Key=analyzed_key,
        Body=data_string
    )

    # The source object is removed only after the result has been stored.
    s3_client.delete_object(
        Bucket=bucket_name,
        Key=object_key
    )

    return {
        "statusCode": 200,
        "body": {
            "message": {
                "objectKey": analyzed_key,
                "bucketName": bucket_name
            }
        },
    }