Mercurial > public > finance-parser
annotate upload_document/app.py @ 15:0a5a4cbaa6d6
Change info files
author | Dennis <dennis@denniscm.com> |
---|---|
date | Fri, 11 Aug 2023 17:04:18 +0000 |
parents | d4c4cd4760fa |
children |
rev | line source |
---|---|
3
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
1 import json |
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
2 import boto3 |
4 | 3 |
3
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
4 |
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
5 s3_client = boto3.client('s3') |
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
6 dynamodb = boto3.resource('dynamodb') |
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
7 table = dynamodb.Table('FinanceParser') |
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
8 |
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
9 |
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
10 def lambda_handler(event, context): |
4 | 11 event_msg = event['body']['message'] |
3
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
12 |
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
13 # Download file from S3 |
4 | 14 s3_client.download_file( |
15 event_msg['bucketName'], | |
16 event_msg['objectKey'], | |
17 '/tmp/document.json' | |
18 ) | |
3
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
19 |
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
20 with open('/tmp/document.json') as f: |
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
21 doc = json.load(f) |
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
22 |
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
23 for dateColumn, date in doc['dateColumns'].items(): |
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
24 for row_index, account in doc['data'].items(): |
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
25 |
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
26 try: |
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
27 column_types = account['type'] |
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
28 except KeyError: |
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
29 column_types = [] |
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
30 |
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
31 """ |
4 | 32 Given: |
3
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
33 +------------------+------+------+ |
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
34 | ASSETS | 2020 | 2019 | |
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
35 +------------------+------+------+ |
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
36 | ASSETS_ACCOUNT_1 | | | |
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
37 +------------------+------+------+ |
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
38 | ASSETS_ACCOUNT_2 | | | |
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
39 +------------------+------+------+ |
4 | 40 |
41 The `COLUMN_HEADER` check after the value lookup below blanks a header cell that merely repeats the column's date, which avoids getting `2020` as the value of `ASSETS`. | |
3
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
42 """ |
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
43 |
10 | 44 try: |
45 account_value = account[dateColumn] | |
46 except KeyError: | |
47 account_value = '' | |
48 | |
3
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
49 if 'COLUMN_HEADER' in column_types and date == account_value: |
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
50 account_value = '' |
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
51 |
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
52 with table.batch_writer() as batch: |
11 | 53 try: |
54 account_name = account['1'] | |
3
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
55 |
11 | 56 # pk -> item_type#company_ticker |
57 # sk -> date#row_index | |
3
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
58 |
11 | 59 batch.put_item( |
60 Item={ | |
12 | 61 'pk': f"{event_msg['docType']}#{event_msg['companyTicker']}", |
11 | 62 'sk': f'{date}#{row_index}', |
63 'account_name': account_name, | |
64 'account_value': account_value, | |
65 'column_types': column_types, | |
66 'format': doc['format'] | |
67 } | |
68 ) | |
69 except KeyError: | |
70 pass | |
3
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
71 |
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
72 # pk -> file#item_type#company_ticker |
4 | 73 # sk -> date#filename |
3
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
74 |
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
75 table.put_item( |
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
76 Item={ |
11 | 77 'pk': f"file#{event_msg['docType']}#{event_msg['companyTicker']}", |
4 | 78 'sk': f"{date}#{event_msg['objectKey'].replace('processed/', '')}" |
3
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
79 } |
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
80 ) |
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
81 |
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
82 return { |
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
83 "statusCode": 200, |
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
84 "body": json.dumps({ |
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
85 "message": "ok" |
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
86 }), |
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
87 } |