diff upload_document/app.py @ 3:2e5f3664f3e4

documents analyzer almost finished
author Dennis C. M. <dennis@denniscm.com>
date Fri, 02 Jun 2023 20:12:29 +0100
parents
children 9005b7590008
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/upload_document/app.py	Fri Jun 02 20:12:29 2023 +0100
@@ -0,0 +1,78 @@
+import json
+import boto3
+import re
+
+# AWS handles are created once at import time, outside the handler, so every
+# invocation served by this module instance shares them.
+s3_client = boto3.client('s3')
+dynamodb = boto3.resource('dynamodb')
+# Table that lambda_handler writes its `balance#...` and `file#...` items to.
+table = dynamodb.Table('FinanceParser')
+
+
+def lambda_handler(event, context):
+    """Load a processed document JSON from S3 into the DynamoDB table.
+
+    Reads `objectKey` and `bucketName` from `event['body']['message']`,
+    downloads that object to `/tmp/document.json`, and writes one
+    `balance#<ticker>` item per (date column, row) pair plus one
+    `file#<ticker>` item per date column.
+
+    Raises:
+        ValueError: if no company ticker can be extracted from the object
+            key (it must match `processed/<ticker>_`).
+
+    Returns:
+        dict with `statusCode` 200 and the JSON-encoded body
+        `{"message": "ok"}`.
+    """
+    event_message = event['body']['message']
+    object_key = event_message['objectKey']
+    bucket_name = event_message['bucketName']
+
+    # Fail with a readable error instead of `AttributeError` on `None` when
+    # the key does not have the expected shape.
+    ticker_match = re.search('processed/(.*)_', object_key)
+    if ticker_match is None:
+        raise ValueError(
+            f'Cannot extract company ticker from object key {object_key!r}'
+        )
+    company_ticker = ticker_match.group(1)
+
+    # Download file from s3
+    s3_client.download_file(bucket_name, object_key, '/tmp/document.json')
+
+    # NOTE(review): json.load yields `float` for fractional numbers, which the
+    # boto3 DynamoDB serializer rejects -- confirm cell values are strings.
+    with open('/tmp/document.json') as f:
+        doc = json.load(f)
+
+    for date_column, date in doc['dateColumns'].items():
+
+        # One batch writer per date column: puts are buffered and sent in
+        # batches, and flushed when the `with` block exits. Opening a writer
+        # per row would flush every item as its own request.
+        with table.batch_writer() as batch:
+            for row_index, account in doc['data'].items():
+                column_types = account.get('type', [])
+
+                # The following statement avoids getting a `2020` as the
+                # value of `ASSETS`.
+                #
+                # +------------------+------+------+
+                # | ASSETS           | 2020 | 2019 |
+                # +------------------+------+------+
+                # | ASSETS_ACCOUNT_1 |      |      |
+                # +------------------+------+------+
+                # | ASSETS_ACCOUNT_2 |      |      |
+                # +------------------+------+------+
+                account_value = account[date_column]
+                if 'COLUMN_HEADER' in column_types and date == account_value:
+                    account_value = ''
+
+                # pk -> item_type#company_ticker
+                # sk -> date#row_index
+                batch.put_item(
+                    Item={
+                        'pk': f'balance#{company_ticker}',
+                        'sk': f'{date}#{row_index}',
+                        'account_name': account['1'],
+                        'account_value': account_value,
+                        'column_types': column_types
+                    }
+                )
+
+        # Written only after the rows of this date column have been flushed.
+        # pk -> item_type#company_ticker
+        # sk -> date
+        table.put_item(
+            Item={
+                'pk': f'file#{company_ticker}',
+                'sk': f"{date}",
+                'filename': object_key.replace('processed/', '')
+            }
+        )
+
+    return {
+        "statusCode": 200,
+        "body": json.dumps({
+            "message": "ok"
+        }),
+    }