Mercurial > public > finance-parser
annotate analyze_document/app.py @ 6:d15ccf5f1373
fix bug clean_text
author | Dennis C. M. <dennis@denniscm.com> |
---|---|
date | Mon, 05 Jun 2023 17:12:18 +0100 |
parents | 9005b7590008 |
children | bf19235a9636 |
rev | line source |
---|---|
3
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
1 import json |
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
2 import boto3 |
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
3 import uuid |
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
4 import re |
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
5 |
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
6 |
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
# Module-level AWS clients: created once, when this module is imported, and
# shared by every call to the handler below.
s3_client = boto3.client('s3')
textract_client = boto3.client('textract')
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
9 |
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
10 |
2e5f3664f3e4
documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents:
diff
changeset
|
def _parse_object_key(object_key):
    """Return ``(company_ticker, doc_type)`` parsed from an S3 object key.

    The key must contain ``unprocessed/<ticker>_<doc_type>.pdf`` and end with
    the literal ``.pdf`` extension.

    Raises:
        ValueError: if the key does not follow that naming scheme.
    """
    # One raw-string pattern instead of two chained searches:
    #   * the dot of ".pdf" is escaped and the match is anchored at the end,
    #     so "..._10Kxpdf" or "..._10K.pdf.bak" are no longer accepted;
    #   * the ticker is never interpolated into a second regex, so regex
    #     metacharacters in it cannot corrupt the pattern.
    # The first group stays greedy, so the ticker is everything up to the
    # LAST underscore, exactly as the previous two-step search split the key.
    match = re.search(r'unprocessed/(.*)_(.*)\.pdf$', object_key)
    if match is None:
        raise ValueError(
            f'Object key {object_key!r} does not match '
            f"'unprocessed/<ticker>_<doc_type>.pdf'"
        )
    return match.group(1), match.group(2)


def lambda_handler(event, context):
    """Analyze an uploaded PDF with Textract and store the result as JSON.

    Reads the bucket name and object key from ``event['detail']``, runs
    ``analyze_document`` with the ``TABLES`` feature on that object, writes
    the JSON-serialized response to ``analyzed/<ticker>_<doc_type>_<uuid>.json``
    in the same bucket, and finally deletes the original object.

    Args:
        event: Mapping that provides ``detail.bucket.name`` and
            ``detail.object.key``.
        context: Lambda context object; not used.

    Returns:
        dict: ``statusCode`` 200 and a ``body`` whose ``message`` holds the
        ticker, document type, generated file id, file name, output object
        key and bucket name.

    Raises:
        KeyError: if the event lacks the expected ``detail`` fields.
        ValueError: if the object key does not follow the
            ``unprocessed/<ticker>_<doc_type>.pdf`` naming scheme.
    """
    event_detail = event['detail']
    bucket_name = event_detail['bucket']['name']
    object_key = event_detail['object']['key']

    # Validate the key before calling any AWS service so a badly named
    # upload fails fast with a descriptive error.
    company_ticker, doc_type = _parse_object_key(object_key)
    file_id = str(uuid.uuid4())

    data_dict = textract_client.analyze_document(
        Document={'S3Object': {'Bucket': bucket_name, 'Name': object_key}},
        FeatureTypes=['TABLES']
    )

    # default=str stringifies any value in the response that json cannot
    # serialize natively.
    data_string = json.dumps(data_dict, indent=2, default=str)
    filename = f'{company_ticker}_{doc_type}_{file_id}.json'
    analyzed_key = f'analyzed/{filename}'

    s3_client.put_object(
        Bucket=bucket_name,
        Key=analyzed_key,
        Body=data_string
    )

    # The source PDF is removed only after the analysis has been stored.
    s3_client.delete_object(
        Bucket=bucket_name,
        Key=object_key
    )

    return {
        "statusCode": 200,
        "body": {
            "message": {
                "companyTicker": company_ticker,
                "docType": doc_type,
                "fileId": file_id,
                "fileName": filename,
                "objectKey": analyzed_key,
                "bucketName": bucket_name
            }
        },
    }