finance-parser: process_document/app.py comparison

comparison process_document/app.py @ 4:9005b7590008

state machine working

author	Dennis C. M. <dennis@denniscm.com>
date	Mon, 05 Jun 2023 10:13:43 +0100
parents	2e5f3664f3e4
children	d15ccf5f1373

comparison

equal deleted inserted replaced

-:2e5f3664f3e4
+:9005b7590008
 s3_client = boto3.client('s3')
 def lambda_handler(event, context):
-event_message = event['body']['message']
+event_msg = event['body']['message']
-object_key = event_message['objectKey']
-bucket_name = event_message['bucketName']
 # Download file from s3
-s3_client.download_file(bucket_name, object_key, '/tmp/document.json')
+s3_client.download_file(
+event_msg['bucketName'],
+event_msg['objectKey'],
+'/tmp/document.json'
+)
 with open('/tmp/document.json') as f:
 doc = json.load(f)
 # Analyze document
 if index < len(child_ids) - 1:
 cell_text += '_'
 # Verify if `Text` could be a valid date
-date_string = is_date(cell_text)
+date_string = is_date(clean_text(cell_text, 'date'))
 if date_string:
 cell_text = date_string
 result['dateRow'] = cell['RowIndex']
 result['dateColumns'][cell['ColumnIndex']] = date_string
 cell_row_index = cell['RowIndex']
 cell_column_index = cell['ColumnIndex']
-data[cell_row_index][cell_column_index] = clean(cell_text)
+data[cell_row_index][cell_column_index] = clean_text(cell_text)
 try:
 data[cell_row_index]['type'] = cell['EntityTypes']
 except KeyError:
 pass
 del row[column_index]
 if len(row) > 1:
 result['data'][row_index] = row
-filename = object_key.replace('analyzed/', 'processed/')
+object_key = event_msg['objectKey'].replace('analyzed/', 'processed/')
 data_string = json.dumps(result, indent=2, default=str)
 s3_client.put_object(
-Bucket=bucket_name,
+Bucket=event_msg['bucketName'],
-Key=filename,
+Key=object_key,
 Body=data_string
 )
 return {
 "statusCode": 200,
 "body": {
 "message": {
-"objectKey": filename,
+"companyTicker": event_msg['companyTicker'],
-"bucketName": bucket_name
+"docType": event_msg['docType'],
+"fileId": event_msg['fileId'],
+"fileName": event_msg['fileName'],
+"objectKey": object_key,
+"bucketName": event_msg['bucketName']
 }
 },
 }
 def is_date(string_date):
 """
 Verify if a string could be a date.
--> Funciona pero es un desastre <-
 """
-formats_allowed = ['%d-%m-%Y', '%d_%m_%Y', '%d/%m/%Y', '%d.%m.%Y', '%Y']
+formats_allowed = ['%d-%m-%Y', '%d/%m/%Y', '%Y']
 for format_allowed in formats_allowed:
 try:
 date = datetime.strptime(string_date, format_allowed)
 if date.year > datetime.now().year or date.year < 1900:
-return  # Fecha fuera de rango
+return  # Date out of range
 return date.strftime("%Y")
 except ValueError:
+continue
-# Try removing characters from the beginning and end
-options = [string_date[:-1], string_date[1:], string_date[1:-1]]
-for option in options:
-try:
-date = datetime.strptime(option, format_allowed)
-if date.year > datetime.now().year or date.year < 1900:
-return  # Fecha fuera de rango
-return date.strftime("%Y")
-except ValueError:
-continue
 return
 def get_format(phrase):
 if amount_format in phrase or plural_amount_format in phrase:
 return amount_format
-def clean(text):
+def clean_text(text, text_type='default'):
 """
 Remove bad characters from word
 """
-characters = ['.', ',', '-', ' ']
+special_chars = [
+'!', '@', '#', '$', '%', '^', '&', '*', '(', ')',
+'-', '_', '+', '=', '[', ']', '{', '}', '\\', '|',
+';', ':', '"', '\'', '<', '>', '/', '?', '.', ','
+]
-for character in characters:
+if text_type == 'date':
-text = text.replace(character, '')
+allowed_chars = ['_', '-', '/']
+# Sometimes date is '2020a' or 'b2020' because indexes
+if text[-1].isalpha():
+special_chars.append(text[-1])
+if text[0].isalpha():
+special_chars.append(text[0])
+else:
+allowed_chars = ['.', ',', '-', ' ']
+special_chars = [char for char in special_chars if char not in allowed_chars]
+for char in special_chars:
+text = text.replace(char, '')
 return text.lower()

Mercurial > public > finance-parser

comparison process_document/app.py @ 4:9005b7590008