annotate analyze_document/app.py @ 14:e77f773aeb46

add deployment metadata
author Dennis C. M. <dennis@denniscm.com>
date Sat, 10 Jun 2023 14:52:03 +0100
parents 2350662483a3
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
3
2e5f3664f3e4 documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff changeset
1 import json
2e5f3664f3e4 documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff changeset
2 import boto3
2e5f3664f3e4 documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff changeset
3 import uuid
2e5f3664f3e4 documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff changeset
4 import re
2e5f3664f3e4 documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff changeset
5
2e5f3664f3e4 documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff changeset
6
2e5f3664f3e4 documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff changeset
# AWS service clients, created once at module import (outside the handler)
# and shared by every call to lambda_handler in this module.
textract_client = boto3.client('textract')
s3_client = boto3.client('s3')
2e5f3664f3e4 documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff changeset
9
2e5f3664f3e4 documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff changeset
10
2e5f3664f3e4 documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff changeset
def lambda_handler(event, context):
    """Analyze an uploaded document with Textract and store the JSON result.

    Reads the bucket name and object key from ``event['detail']``, derives
    the company ticker and document type from a key shaped like
    ``unprocessed/<ticker>_<doc_type>_...``, runs Textract
    ``analyze_document`` with the ``TABLES`` feature on that object, writes
    the JSON-serialized response to
    ``analyzed/<ticker>_<doc_type>_<uuid4>.json`` in the same bucket, and
    finally deletes the source object.

    Args:
        event: Invocation payload. Must provide ``detail.bucket.name`` and
            ``detail.object.key``.
        context: Lambda context object (unused).

    Returns:
        A dict with ``statusCode`` 200 and ``body.message`` holding
        ``companyTicker``, ``docType``, ``fileId``, ``fileName``,
        ``objectKey`` (the key of the stored analysis) and ``bucketName``.

    Raises:
        KeyError: If ``event`` lacks one of the expected fields.
        AttributeError: If the object key does not follow the
            ``unprocessed/<ticker>_<doc_type>_`` layout (``re.search``
            returns ``None``).
    """
    event_detail = event['detail']
    bucket_name = event_detail['bucket']['name']
    object_key = event_detail['object']['key']

    company_ticker = re.search(r'unprocessed/(.*?)_', object_key).group(1)
    # The ticker is taken verbatim from the object key, so it must be escaped
    # before being embedded in a pattern; otherwise a ticker containing regex
    # metacharacters (e.g. "C++" or "A*") breaks or mis-matches the search.
    doc_type = re.search(
        rf'unprocessed/{re.escape(company_ticker)}_(.*?)_',
        object_key,
    ).group(1)
    file_id = str(uuid.uuid4())

    data_dict = textract_client.analyze_document(
        Document={'S3Object': {'Bucket': bucket_name, 'Name': object_key}},
        FeatureTypes=['TABLES']
    )

    # default=str stringifies any value in the response that json cannot
    # encode natively instead of raising TypeError.
    data_string = json.dumps(data_dict, indent=2, default=str)
    filename = f'{company_ticker}_{doc_type}_{file_id}.json'
    analyzed_key = f'analyzed/{filename}'

    s3_client.put_object(
        Bucket=bucket_name,
        Key=analyzed_key,
        Body=data_string
    )

    # The source object is removed only after the analysis has been written.
    s3_client.delete_object(
        Bucket=bucket_name,
        Key=object_key
    )

    return {
        "statusCode": 200,
        "body": {
            "message": {
                "companyTicker": company_ticker,
                "docType": doc_type,
                "fileId": file_id,
                "fileName": filename,
                "objectKey": analyzed_key,
                "bucketName": bucket_name
            }
        },
    }