annotate upload_document/app.py @ 3:2e5f3664f3e4

documents analyzer almost finished
author Dennis C. M. <dennis@denniscm.com>
date Fri, 02 Jun 2023 20:12:29 +0100
parents
children 9005b7590008
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
3
2e5f3664f3e4 documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff changeset
1 import json
2e5f3664f3e4 documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff changeset
2 import boto3
2e5f3664f3e4 documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff changeset
3 import re
2e5f3664f3e4 documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff changeset
4
2e5f3664f3e4 documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff changeset
# AWS handles are built once, when the module is imported, and are shared by
# every call to `lambda_handler` served by this module instance.
s3_client = boto3.client('s3')
dynamodb = boto3.resource('dynamodb')
# Table that receives both the per-row `balance#...` items and the
# per-date `file#...` items written by `lambda_handler`.
table = dynamodb.Table('FinanceParser')
2e5f3664f3e4 documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff changeset
8
2e5f3664f3e4 documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff changeset
9
2e5f3664f3e4 documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff changeset
def lambda_handler(event, context):
    """Load a processed document from S3 and store its rows in DynamoDB.

    The document is downloaded to ``/tmp/document.json`` and parsed as JSON.
    For every entry of ``doc['dateColumns']`` and every entry of
    ``doc['data']`` one ``balance#<ticker>`` item is written, plus one
    ``file#<ticker>`` item per date column that records the source filename.

    Args:
        event: Invocation payload. ``event['body']['message']`` must contain
            ``objectKey`` and ``bucketName``.
        context: Lambda context object (unused).

    Returns:
        dict: ``{"statusCode": 200, "body": '{"message": "ok"}'}``.

    Raises:
        KeyError: If the event or the document lacks an expected key.
        AttributeError: If ``objectKey`` does not match ``processed/(.*)_``.
    """
    event_message = event['body']['message']
    object_key = event_message['objectKey']
    bucket_name = event_message['bucketName']

    # NOTE(review): `(.*)` is greedy, so the ticker is everything between
    # `processed/` and the LAST underscore of the key - confirm this matches
    # the naming scheme of the processed files.
    company_ticker = re.search('processed/(.*)_', object_key).group(1)
    filename = object_key.replace('processed/', '')

    # Download file from s3
    s3_client.download_file(bucket_name, object_key, '/tmp/document.json')

    with open('/tmp/document.json', encoding='utf-8') as f:
        doc = json.load(f)

    # A single batch writer wraps both loops so that items are buffered and
    # sent in batches instead of one request per row.
    # `overwrite_by_pkeys` de-duplicates buffered items sharing the same
    # pk/sk (last one wins), which keeps the semantics of sequential puts
    # if two date columns happen to carry the same date.
    # NOTE(review): assumes the table's key schema is (pk, sk) - confirm.
    with table.batch_writer(overwrite_by_pkeys=['pk', 'sk']) as batch:
        for date_column, date in doc['dateColumns'].items():
            for row_index, account in doc['data'].items():
                # Rows without a 'type' entry are treated as untyped.
                column_types = account.get('type', [])

                # The following statement avoids getting a `2020` as the
                # value of `ASSETS`.
                #
                # +------------------+------+------+
                # | ASSETS           | 2020 | 2019 |
                # +------------------+------+------+
                # | ASSETS_ACCOUNT_1 |      |      |
                # +------------------+------+------+
                # | ASSETS_ACCOUNT_2 |      |      |
                # +------------------+------+------+
                account_value = account[date_column]
                if 'COLUMN_HEADER' in column_types and date == account_value:
                    account_value = ''

                # pk -> item_type#company_ticker
                # sk -> date#row_index
                batch.put_item(
                    Item={
                        'pk': f'balance#{company_ticker}',
                        'sk': f'{date}#{row_index}',
                        'account_name': account['1'],
                        'account_value': account_value,
                        'column_types': column_types,
                    }
                )

            # pk -> item_type#company_ticker
            # sk -> date
            batch.put_item(
                Item={
                    'pk': f'file#{company_ticker}',
                    'sk': f"{date}",
                    'filename': filename,
                }
            )

    return {
        "statusCode": 200,
        "body": json.dumps({
            "message": "ok"
        }),
    }