Mercurial > public > finance-parser
annotate upload_document/app.py @ 4:9005b7590008
state machine working
author | Dennis C. M. <dennis@denniscm.com> |
---|---|
date | Mon, 05 Jun 2023 10:13:43 +0100 |
parents | 2e5f3664f3e4 |
children | d15ccf5f1373 |
rev | line source |
---|---|
3
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
1 import json |
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
2 import boto3 |
4 | 3 |
3
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
4 |
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
# AWS handles are created once, at import time, and shared by every call
# made from this module.
s3_client = boto3.client('s3')
dynamodb = boto3.resource('dynamodb')
table = dynamodb.Table('FinanceParser')
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
8 |
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
9 |
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
def _iter_balance_items(doc, company_ticker):
    """Yield one table item per (date column, row) cell of the document.

    `doc['dateColumns']` maps a column key to its date and `doc['data']`
    maps a row index to an account row. Each row is indexed by the column
    keys, stores the account name under key '1', and may carry a 'type'
    list.
    """
    for date_column, date in doc['dateColumns'].items():
        for row_index, account in doc['data'].items():
            # Rows without a 'type' entry are treated as untyped.
            column_types = account.get('type', [])

            # Given:
            # +------------------+------+------+
            # | ASSETS           | 2020 | 2019 |
            # +------------------+------+------+
            # | ASSETS_ACCOUNT_1 |      |      |
            # +------------------+------+------+
            # | ASSETS_ACCOUNT_2 |      |      |
            # +------------------+------+------+
            #
            # The following statement avoids getting `2020` as the value
            # of `ASSETS`.
            account_value = account[date_column]
            if 'COLUMN_HEADER' in column_types and date == account_value:
                account_value = ''

            # pk -> item_type#company_ticker
            # sk -> date#row_index
            yield {
                'pk': f"balance#{company_ticker}",
                'sk': f'{date}#{row_index}',
                'account_name': account['1'],
                'account_value': account_value,
                'column_types': column_types,
            }


def lambda_handler(event, context):
    """Load a processed document from S3 and store its rows in DynamoDB.

    Reads `bucketName`, `objectKey` and `companyTicker` from
    `event['body']['message']`, downloads the referenced JSON document,
    writes one `balance#<ticker>` item per (date, row) cell and one
    `file#<ticker>` item per date column.

    Args:
        event: Invocation payload; must contain `body.message` with the
            three keys listed above.
        context: Lambda context object (unused).

    Returns:
        A dict with `statusCode` 200 and a JSON-encoded `body` of
        `{"message": "ok"}`.

    Raises:
        KeyError: If the event or the document lacks an expected key.
    """
    event_msg = event['body']['message']
    company_ticker = event_msg['companyTicker']
    object_key = event_msg['objectKey']
    local_path = '/tmp/document.json'

    # Download file from s3
    s3_client.download_file(event_msg['bucketName'], object_key, local_path)

    with open(local_path, encoding='utf-8') as f:
        doc = json.load(f)

    # A single writer for all rows lets boto3 group the puts into batched
    # requests. `overwrite_by_pkeys` keeps last-write-wins semantics when two
    # date columns resolve to the same date (identical pk/sk), instead of
    # failing the batch on duplicate keys.
    with table.batch_writer(overwrite_by_pkeys=['pk', 'sk']) as batch:
        for item in _iter_balance_items(doc, company_ticker):
            batch.put_item(Item=item)

    # pk -> item_type#company_ticker
    # sk -> date#filename
    # File records are written only after the batch above has been flushed.
    # NOTE(review): one file record per date column - confirm this matches
    # the intended index layout.
    filename = object_key.replace('processed/', '')
    for date in doc['dateColumns'].values():
        table.put_item(
            Item={
                'pk': f"file#{company_ticker}",
                'sk': f"{date}#{filename}",
            }
        )

    return {
        "statusCode": 200,
        "body": json.dumps({
            "message": "ok"
        }),
    }