Mercurial > public > finance-parser
annotate upload_document/app.py @ 3:2e5f3664f3e4
documents analyzer almost finished
author | Dennis C. M. <dennis@denniscm.com> |
---|---|
date | Fri, 02 Jun 2023 20:12:29 +0100 |
parents | |
children | 9005b7590008 |
rev | line source |
---|---|
3
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
1 import json |
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
2 import boto3 |
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
3 import re |
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
4 |
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
# Shared AWS handles, created once at module import so that every call to
# lambda_handler reuses the same objects.
s3_client = boto3.client('s3')  # used by lambda_handler to download the document
dynamodb = boto3.resource('dynamodb')
table = dynamodb.Table('FinanceParser')  # table that lambda_handler writes items to
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
8 |
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
9 |
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
def lambda_handler(event, context):
    """Load a processed document from S3 and store its rows in DynamoDB.

    Reads ``objectKey`` and ``bucketName`` from ``event['body']['message']``,
    downloads that object to ``/tmp/document.json`` and, for every entry in
    the document's ``dateColumns``, writes one ``balance#<ticker>`` item per
    entry in ``data`` followed by one ``file#<ticker>`` item that records the
    source filename.

    Args:
        event: Invocation payload. Must contain ``body.message.objectKey``
            and ``body.message.bucketName``.
        context: Lambda context object (unused).

    Returns:
        A dict with ``statusCode`` 200 and a JSON-encoded ``body`` of
        ``{"message": "ok"}``.

    Raises:
        KeyError: If the event or the document lacks an expected key.
        ValueError: If ``objectKey`` does not match ``processed/<ticker>_...``.
    """
    # Local import: only this function needs Decimal.
    from decimal import Decimal

    event_message = event['body']['message']
    object_key = event_message['objectKey']
    bucket_name = event_message['bucketName']

    # The ticker is everything between 'processed/' and the last underscore.
    ticker_match = re.search(r'processed/(.*)_', object_key)
    if ticker_match is None:
        raise ValueError(
            f"Cannot extract company ticker from object key {object_key!r}; "
            "expected 'processed/<ticker>_...'"
        )
    company_ticker = ticker_match.group(1)

    # Download file from s3
    s3_client.download_file(bucket_name, object_key, '/tmp/document.json')

    # The boto3 DynamoDB resource rejects Python floats, so parse any decimal
    # numbers in the document as Decimal.
    with open('/tmp/document.json', encoding='utf-8') as f:
        doc = json.load(f, parse_float=Decimal)

    for date_column, date in doc['dateColumns'].items():
        # One batch writer per date column, so rows are actually sent in
        # batches instead of one request per row. The writer flushes when the
        # `with` block exits, i.e. before the file record below is written.
        # overwrite_by_pkeys keeps last-write-wins semantics should the same
        # (pk, sk) pair occur twice within one batch.
        with table.batch_writer(overwrite_by_pkeys=['pk', 'sk']) as batch:
            for row_index, account in doc['data'].items():
                column_types = account.get('type', [])

                # The following check avoids getting a `2020` as the value
                # of `ASSETS`:
                #
                # +------------------+------+------+
                # | ASSETS           | 2020 | 2019 |
                # +------------------+------+------+
                # | ASSETS_ACCOUNT_1 |      |      |
                # +------------------+------+------+
                # | ASSETS_ACCOUNT_2 |      |      |
                # +------------------+------+------+
                account_value = account[date_column]
                if 'COLUMN_HEADER' in column_types and date == account_value:
                    account_value = ''

                # pk -> item_type#company_ticker
                # sk -> date#row_index
                batch.put_item(
                    Item={
                        'pk': f'balance#{company_ticker}',
                        'sk': f'{date}#{row_index}',
                        'account_name': account['1'],
                        'account_value': account_value,
                        'column_types': column_types,
                    }
                )

        # pk -> item_type#company_ticker
        # sk -> date
        table.put_item(
            Item={
                'pk': f'file#{company_ticker}',
                'sk': f"{date}",
                'filename': object_key.replace('processed/', ''),
            }
        )

    return {
        "statusCode": 200,
        "body": json.dumps({
            "message": "ok"
        }),
    }