Mercurial > public > finance-parser
diff upload_document/app.py @ 3:2e5f3664f3e4
documents analyzer almost finished
author | Dennis C. M. <dennis@denniscm.com> |
---|---|
date | Fri, 02 Jun 2023 20:12:29 +0100 |
parents | |
children | 9005b7590008 |
line wrap: on
line diff
"""Lambda entry point of ``upload_document/app.py``.

Downloads a processed document (JSON) from S3 and stores its contents in
the ``FinanceParser`` DynamoDB table.
"""

import json
import re
from decimal import Decimal

import boto3

s3_client = boto3.client('s3')
dynamodb = boto3.resource('dynamodb')
table = dynamodb.Table('FinanceParser')

# The capture group is greedy: it takes everything between 'processed/' and
# the LAST underscore of the key.
_TICKER_PATTERN = re.compile(r'processed/(.*)_')

# Scratch location the S3 object is downloaded to before being parsed.
_LOCAL_DOCUMENT_PATH = '/tmp/document.json'


def _extract_company_ticker(object_key):
    """Return the company ticker embedded in *object_key*.

    Raises:
        ValueError: if the key does not look like ``processed/<ticker>_...``.
    """
    match = _TICKER_PATTERN.search(object_key)
    if match is None:
        raise ValueError(
            f"Cannot extract company ticker from object key {object_key!r}; "
            "expected 'processed/<ticker>_...'"
        )
    return match.group(1)


def _iter_items(doc, company_ticker, object_key):
    """Yield every DynamoDB item derived from *doc*, in write order.

    For each entry of ``doc['dateColumns']`` one ``balance#`` item is yielded
    per row of ``doc['data']``, followed by a single ``file#`` item for that
    date.

    NOTE(review): the source this was recovered from had lost its
    indentation; emitting the ``file#`` item once per date (after that
    date's rows) is the reading that matches its ``sk -> date`` comment.
    Confirm against the repository.
    """
    filename = object_key.replace('processed/', '')

    for date_column, date in doc['dateColumns'].items():
        for row_index, account in doc['data'].items():
            # Rows without a 'type' entry are treated as having no types.
            column_types = account.get('type', [])

            # Avoid storing e.g. `2020` as the value of `ASSETS` when the
            # row is itself the table header:
            #
            # +------------------+------+------+
            # | ASSETS           | 2020 | 2019 |
            # +------------------+------+------+
            # | ASSETS_ACCOUNT_1 |      |      |
            # +------------------+------+------+
            # | ASSETS_ACCOUNT_2 |      |      |
            # +------------------+------+------+
            account_value = account[date_column]
            if 'COLUMN_HEADER' in column_types and date == account_value:
                account_value = ''

            # pk -> item_type#company_ticker
            # sk -> date#row_index
            yield {
                'pk': f'balance#{company_ticker}',
                'sk': f'{date}#{row_index}',
                'account_name': account['1'],
                'account_value': account_value,
                'column_types': column_types,
            }

        # pk -> item_type#company_ticker
        # sk -> date
        yield {
            'pk': f'file#{company_ticker}',
            'sk': f"{date}",
            'filename': filename,
        }


def lambda_handler(event, context):
    """Load a processed document from S3 into the ``FinanceParser`` table.

    Args:
        event: Invocation payload. ``event['body']['message']`` must provide
            ``objectKey`` and ``bucketName`` of the document to load.
        context: Lambda context object (unused).

    Returns:
        A dict with ``statusCode`` 200 and a JSON-encoded ``body`` of
        ``{"message": "ok"}``.

    Raises:
        KeyError: if the event or the document lacks a required field.
        ValueError: if the company ticker cannot be derived from the key.
    """
    event_message = event['body']['message']
    object_key = event_message['objectKey']
    bucket_name = event_message['bucketName']
    company_ticker = _extract_company_ticker(object_key)

    # Download file from s3
    s3_client.download_file(bucket_name, object_key, _LOCAL_DOCUMENT_PATH)

    with open(_LOCAL_DOCUMENT_PATH, encoding='utf-8') as f:
        # The boto3 DynamoDB serializer rejects Python floats, so parse any
        # JSON real number as Decimal instead.
        doc = json.load(f, parse_float=Decimal)

    # One writer for the whole document so puts are actually sent in
    # batches. `overwrite_by_pkeys` keeps last-write-wins semantics if the
    # same pk/sk pair is produced twice (e.g. two date columns that map to
    # the same date) instead of failing the batch on duplicate keys.
    # NOTE(review): presumes `pk`/`sk` are the table's key attributes, as
    # the item comments suggest; confirm against the table definition.
    with table.batch_writer(overwrite_by_pkeys=['pk', 'sk']) as batch:
        for item in _iter_items(doc, company_ticker, object_key):
            batch.put_item(Item=item)

    return {
        "statusCode": 200,
        "body": json.dumps({
            "message": "ok"
        }),
    }