annotate process_document/app.py @ 2:ef8a4d95755a

add aws sam project
author Dennis C. M. <dennis@denniscm.com>
date Thu, 01 Jun 2023 18:51:18 +0100
parents
children 2e5f3664f3e4
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
2
ef8a4d95755a add aws sam project
Dennis C. M. <dennis@denniscm.com>
parents:
diff changeset
1 import json
ef8a4d95755a add aws sam project
Dennis C. M. <dennis@denniscm.com>
parents:
diff changeset
2 import boto3
ef8a4d95755a add aws sam project
Dennis C. M. <dennis@denniscm.com>
parents:
diff changeset
3 from datetime import datetime
ef8a4d95755a add aws sam project
Dennis C. M. <dennis@denniscm.com>
parents:
diff changeset
4 from collections import defaultdict
ef8a4d95755a add aws sam project
Dennis C. M. <dennis@denniscm.com>
parents:
diff changeset
5
ef8a4d95755a add aws sam project
Dennis C. M. <dennis@denniscm.com>
parents:
diff changeset
# boto3 service clients, created once when the module is imported so every
# call to lambda_handler shares them. `textract_client` is the one used by
# lambda_handler; `s3_client` is not referenced in the visible part of the file.
s3_client = boto3.client('s3')
textract_client = boto3.client('textract')
ef8a4d95755a add aws sam project
Dennis C. M. <dennis@denniscm.com>
parents:
diff changeset
8
ef8a4d95755a add aws sam project
Dennis C. M. <dennis@denniscm.com>
parents:
diff changeset
9
ef8a4d95755a add aws sam project
Dennis C. M. <dennis@denniscm.com>
parents:
diff changeset
def lambda_handler(event, context):
    """
    Run Textract table analysis on every S3 object referenced by the event.

    For each record in ``event['Records']`` the object is analysed with
    ``analyze_document`` (``TABLES`` feature) and a ``result`` mapping is
    built from the returned blocks:

    * ``format``      - value returned by ``get_format`` for the first
                        ``LINE`` block that yields one, otherwise ``None``.
    * ``dateRow``     - ``RowIndex`` of the last cell accepted by ``is_date``.
    * ``dateColumns`` - ``ColumnIndex`` -> year string for each such cell.
    * ``data``        - rows located below ``dateRow``, reduced to column 1
                        (presumably the row label - confirm) plus the date
                        columns; rows left with a single cell are dropped.

    The result is only printed; it is not returned or stored.

    :param event: S3 notification payload; ``Records[*].s3.bucket.name`` and
                  ``Records[*].s3.object.key`` are read.
    :param context: Lambda context object (unused).
    :return: dict with ``statusCode`` 200 and a JSON ``body`` of
             ``{"message": "ok"}``.
    """

    # Imported here so the function does not rely on the module import block.
    from urllib.parse import unquote_plus

    for record in event['Records']:
        metadata = record['s3']
        bucket_name = metadata['bucket']['name']
        # S3 notifications deliver the key URL-encoded (a space arrives as
        # '+'); Textract needs the real key.
        object_key = unquote_plus(metadata['object']['key'])

        doc = textract_client.analyze_document(
            Document={'S3Object': {'Bucket': bucket_name, 'Name': object_key}},
            FeatureTypes=['TABLES']
        )

        # Analyze document
        result = defaultdict(dict)
        blocks = doc['Blocks']
        # Index the blocks once so child lookups are not a scan per word.
        blocks_by_id = {block['Id']: block for block in blocks}

        # Get format: stop at the first line that states an amount unit
        result['format'] = None
        for line in filter_blocks(blocks, 'BlockType', 'LINE'):
            amount_format = get_format(line['Text'])
            if amount_format:
                result['format'] = amount_format
                break

        # Find dates value and position
        data = defaultdict(dict)
        for cell in filter_blocks(blocks, 'BlockType', 'CELL'):
            # Ids of the first CHILD relationship; cells without one carry no
            # text and are skipped.
            child_ids = next(
                (
                    relationship['Ids']
                    for relationship in cell.get('Relationships', [])
                    if relationship['Type'] == 'CHILD'
                ),
                None,
            )
            if child_ids is None:
                continue

            # Get `Text` from `CELL` block: child texts joined with '_'.
            # Children that are unknown or have no `Text` entry are ignored.
            words = [
                blocks_by_id[child_id]['Text']
                for child_id in child_ids
                if 'Text' in blocks_by_id.get(child_id, {})
            ]
            cell_text = '_'.join(words)

            cell_row_index = cell['RowIndex']
            cell_column_index = cell['ColumnIndex']

            # Verify if `Text` could be a valid date
            date_string = is_date(cell_text)
            if date_string:
                cell_text = date_string
                result['dateRow'] = cell_row_index
                result['dateColumns'][cell_column_index] = date_string

            data[cell_row_index][cell_column_index] = clean(cell_text)

        # Delete unused rows and columns. Without a detected date row there is
        # nothing to anchor the table on, so no data rows are emitted.
        date_row = result.get('dateRow')
        if date_row is not None:
            date_columns = result['dateColumns']
            for row_index, row in data.items():
                if row_index <= date_row:
                    continue

                kept = {
                    column_index: text
                    for column_index, text in row.items()
                    if column_index == 1 or column_index in date_columns
                }
                if len(kept) > 1:
                    result['data'][row_index] = kept

        print(f'RESULT: {result}')

    return {
        "statusCode": 200,
        "body": json.dumps({
            "message": "ok"
        }),
    }
ef8a4d95755a add aws sam project
Dennis C. M. <dennis@denniscm.com>
parents:
diff changeset
81
ef8a4d95755a add aws sam project
Dennis C. M. <dennis@denniscm.com>
parents:
diff changeset
82
ef8a4d95755a add aws sam project
Dennis C. M. <dennis@denniscm.com>
parents:
diff changeset
def filter_blocks(blocks, block_key, block_value):
    """
    Return the list of blocks whose `block_key` entry equals `block_value`.

    Input order is preserved. Every block is indexed with `block_key`, so a
    block lacking that key raises KeyError.
    """

    matching = []
    for candidate in blocks:
        if candidate[block_key] == block_value:
            matching.append(candidate)

    return matching
ef8a4d95755a add aws sam project
Dennis C. M. <dennis@denniscm.com>
parents:
diff changeset
89
ef8a4d95755a add aws sam project
Dennis C. M. <dennis@denniscm.com>
parents:
diff changeset
90
ef8a4d95755a add aws sam project
Dennis C. M. <dennis@denniscm.com>
parents:
diff changeset
def is_date(string_date):
    """
    Verify if a string could be a date.

    The text is parsed against each allowed format in turn: first as given,
    then with its last character, its first character, and both removed
    (to tolerate one stray character on either side, e.g. a footnote mark).
    A parse only counts when the year lies between 1900 and the current
    year; an out-of-range parse is skipped and the search continues, so a
    stray leading character is handled the same way as a trailing one.

    :param string_date: text to test.
    :return: the four-digit year as a string, or None when no candidate
             parses to a year in range.
    """

    formats_allowed = ['%d-%m-%Y', '%d_%m_%Y', '%d/%m/%Y', '%d.%m.%Y', '%Y']
    current_year = datetime.now().year

    # Raw text first, then the variants with characters removed from the
    # beginning and end.
    candidates = [
        string_date,
        string_date[:-1],
        string_date[1:],
        string_date[1:-1],
    ]

    for format_allowed in formats_allowed:
        for candidate in candidates:
            try:
                date = datetime.strptime(candidate, format_allowed)
            except ValueError:
                continue

            # Date out of range: keep looking instead of giving up.
            if 1900 <= date.year <= current_year:
                return date.strftime("%Y")

    return None
ef8a4d95755a add aws sam project
Dennis C. M. <dennis@denniscm.com>
parents:
diff changeset
124
ef8a4d95755a add aws sam project
Dennis C. M. <dennis@denniscm.com>
parents:
diff changeset
125
ef8a4d95755a add aws sam project
Dennis C. M. <dennis@denniscm.com>
parents:
diff changeset
def get_format(phrase):
    """
    Given a phrase, report which amount format it mentions, if any.

    The search is case-insensitive and checks 'thousand', 'million' and
    'billion' in that order. Plural forms need no separate check because
    each singular is a substring of its plural.

    :param phrase: text to inspect.
    :return: 'thousand', 'million' or 'billion' for the first one found,
             otherwise None.
    """

    amount_formats = ['thousand', 'million', 'billion']
    lowered_phrase = phrase.lower()

    for amount_format in amount_formats:
        if amount_format in lowered_phrase:
            return amount_format

    return None
ef8a4d95755a add aws sam project
Dennis C. M. <dennis@denniscm.com>
parents:
diff changeset
138
ef8a4d95755a add aws sam project
Dennis C. M. <dennis@denniscm.com>
parents:
diff changeset
139
ef8a4d95755a add aws sam project
Dennis C. M. <dennis@denniscm.com>
parents:
diff changeset
def clean(text):
    """
    Strip '.', ',', '-' and space characters from `text` and lower-case
    what remains.
    """

    unwanted = str.maketrans('', '', '.,- ')

    return text.translate(unwanted).lower()