comparison process_document/app.py @ 4:9005b7590008

state machine working
author Dennis C. M. <dennis@denniscm.com>
date Mon, 05 Jun 2023 10:13:43 +0100
parents 2e5f3664f3e4
children d15ccf5f1373
comparison
equal deleted inserted replaced
3:2e5f3664f3e4 4:9005b7590008
6 6
7 s3_client = boto3.client('s3') 7 s3_client = boto3.client('s3')
8 8
9 9
10 def lambda_handler(event, context): 10 def lambda_handler(event, context):
11 event_message = event['body']['message'] 11 event_msg = event['body']['message']
12 object_key = event_message['objectKey']
13 bucket_name = event_message['bucketName']
14 12
15 # Download file from s3 13 # Download file from s3
16 s3_client.download_file(bucket_name, object_key, '/tmp/document.json') 14 s3_client.download_file(
15 event_msg['bucketName'],
16 event_msg['objectKey'],
17 '/tmp/document.json'
18 )
17 19
18 with open('/tmp/document.json') as f: 20 with open('/tmp/document.json') as f:
19 doc = json.load(f) 21 doc = json.load(f)
20 22
21 # Analyze document 23 # Analyze document
47 49
48 if index < len(child_ids) - 1: 50 if index < len(child_ids) - 1:
49 cell_text += '_' 51 cell_text += '_'
50 52
51 # Verify if `Text` could be a valid date 53 # Verify if `Text` could be a valid date
52 date_string = is_date(cell_text) 54 date_string = is_date(clean_text(cell_text, 'date'))
53 if date_string: 55 if date_string:
54 cell_text = date_string 56 cell_text = date_string
55 result['dateRow'] = cell['RowIndex'] 57 result['dateRow'] = cell['RowIndex']
56 result['dateColumns'][cell['ColumnIndex']] = date_string 58 result['dateColumns'][cell['ColumnIndex']] = date_string
57 59
58 cell_row_index = cell['RowIndex'] 60 cell_row_index = cell['RowIndex']
59 cell_column_index = cell['ColumnIndex'] 61 cell_column_index = cell['ColumnIndex']
60 data[cell_row_index][cell_column_index] = clean(cell_text) 62 data[cell_row_index][cell_column_index] = clean_text(cell_text)
61 63
62 try: 64 try:
63 data[cell_row_index]['type'] = cell['EntityTypes'] 65 data[cell_row_index]['type'] = cell['EntityTypes']
64 except KeyError: 66 except KeyError:
65 pass 67 pass
73 del row[column_index] 75 del row[column_index]
74 76
75 if len(row) > 1: 77 if len(row) > 1:
76 result['data'][row_index] = row 78 result['data'][row_index] = row
77 79
78 filename = object_key.replace('analyzed/', 'processed/') 80 object_key = event_msg['objectKey'].replace('analyzed/', 'processed/')
79 data_string = json.dumps(result, indent=2, default=str) 81 data_string = json.dumps(result, indent=2, default=str)
80 82
81 s3_client.put_object( 83 s3_client.put_object(
82 Bucket=bucket_name, 84 Bucket=event_msg['bucketName'],
83 Key=filename, 85 Key=object_key,
84 Body=data_string 86 Body=data_string
85 ) 87 )
86 88
87 return { 89 return {
88 "statusCode": 200, 90 "statusCode": 200,
89 "body": { 91 "body": {
90 "message": { 92 "message": {
91 "objectKey": filename, 93 "companyTicker": event_msg['companyTicker'],
92 "bucketName": bucket_name 94 "docType": event_msg['docType'],
95 "fileId": event_msg['fileId'],
96 "fileName": event_msg['fileName'],
97 "objectKey": object_key,
98 "bucketName": event_msg['bucketName']
93 } 99 }
94 }, 100 },
95 } 101 }
96 102
97 103
104 110
105 111
def is_date(string_date):
    """
    Verify if a string could be a date.

    The string is tried, in order, against the formats ``%d-%m-%Y``,
    ``%d/%m/%Y`` and ``%Y``. The first format that parses decides the result.

    :param string_date: candidate text, expected to be a ``str``
    :return: the four-digit year as a string when the text parses and the
        year lies between 1900 and the current year (inclusive); otherwise
        ``None``
    """

    # Anything that is not a string (e.g. None) can never be a date; without
    # this guard `strptime` would raise TypeError instead of reporting "no".
    if not isinstance(string_date, str):
        return None

    formats_allowed = ('%d-%m-%Y', '%d/%m/%Y', '%Y')
    current_year = datetime.now().year

    for format_allowed in formats_allowed:
        # Only the parse itself can raise ValueError, so keep the try minimal
        try:
            parsed = datetime.strptime(string_date, format_allowed)
        except ValueError:
            continue

        if parsed.year > current_year or parsed.year < 1900:
            return None  # Date out of range

        return parsed.strftime("%Y")

    return None
139 131
140 132
141 def get_format(phrase): 133 def get_format(phrase):
150 142
151 if amount_format in phrase or plural_amount_format in phrase: 143 if amount_format in phrase or plural_amount_format in phrase:
152 return amount_format 144 return amount_format
153 145
154 146
def clean_text(text, text_type='default'):
    """
    Remove bad characters from word

    :param text: the string to clean
    :param text_type: ``'date'`` keeps ``_``, ``-`` and ``/`` and also drops
        the letter found at the start and/or end of the text; any other value
        keeps ``.``, ``,``, ``-`` and spaces
    :return: the cleaned text, lower-cased
    """

    special_chars = {
        '!', '@', '#', '$', '%', '^', '&', '*', '(', ')',
        '-', '_', '+', '=', '[', ']', '{', '}', '\\', '|',
        ';', ':', '"', '\'', '<', '>', '/', '?', '.', ','
    }

    if text_type == 'date':
        allowed_chars = {'_', '-', '/'}

        # Sometimes date is '2020a' or 'b2020' because indexes.
        # Guard against empty text: indexing '' would raise IndexError.
        # NOTE: every occurrence of the edge letter is removed, not only the
        # one at the edge; kept as-is for backward compatibility.
        if text:
            for edge_char in (text[-1], text[0]):
                if edge_char.isalpha():
                    special_chars.add(edge_char)
    else:
        allowed_chars = {'.', ',', '-', ' '}

    # Delete all unwanted characters in a single pass
    chars_to_remove = ''.join(special_chars - allowed_chars)
    removal_table = str.maketrans('', '', chars_to_remove)

    return text.translate(removal_table).lower()