diff analyze_document/app.py @ 3:2e5f3664f3e4

documents analyzer almost finished
author Dennis C. M. <dennis@denniscm.com>
date Fri, 02 Jun 2023 20:12:29 +0100
parents
children 9005b7590008
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/analyze_document/app.py	Fri Jun 02 20:12:29 2023 +0100
@@ -0,0 +1,44 @@
+import json
+import boto3
+import uuid
+import re
+
+
+textract_client = boto3.client('textract')  # Textract client, built once at import
+s3_client = boto3.client('s3')  # S3 client, built once at import
+
+
+def lambda_handler(event, context):
+    event_detail = event['detail']
+    bucket_name = event_detail['bucket']['name']
+    object_key = event_detail['object']['key']
+    company_ticker = re.search('unprocessed/(.*).pdf', object_key).group(1)
+
+    data_dict = textract_client.analyze_document(
+        Document={'S3Object': {'Bucket': bucket_name, 'Name': object_key}},
+        FeatureTypes=['TABLES']
+    )
+
+    data_string = json.dumps(data_dict, indent=2, default=str)
+    filename = f'{company_ticker}_{uuid.uuid4()}.json'
+
+    s3_client.put_object(
+        Bucket=bucket_name,
+        Key=f'analyzed/{filename}',
+        Body=data_string
+    )
+
+    s3_client.delete_object(
+        Bucket=bucket_name,
+        Key=object_key
+    )
+
+    return {
+        "statusCode": 200,
+        "body": {
+            "message": {
+                "objectKey": f'analyzed/{filename}',
+                "bucketName": bucket_name
+            }
+        },
+    }