Mercurial > public > finance-parser

diff process_document/app.py @ 4:9005b7590008
state machine working
author: Dennis C. M. <dennis@denniscm.com>
date: Mon, 05 Jun 2023 10:13:43 +0100
parents: 2e5f3664f3e4
children: d15ccf5f1373
--- a/process_document/app.py	Fri Jun 02 20:12:29 2023 +0100
+++ b/process_document/app.py	Mon Jun 05 10:13:43 2023 +0100
@@ -8,12 +8,14 @@
 
 
 def lambda_handler(event, context):
-    event_message = event['body']['message']
-    object_key = event_message['objectKey']
-    bucket_name = event_message['bucketName']
+    event_msg = event['body']['message']
 
     # Download file from s3
-    s3_client.download_file(bucket_name, object_key, '/tmp/document.json')
+    s3_client.download_file(
+        event_msg['bucketName'],
+        event_msg['objectKey'],
+        '/tmp/document.json'
+    )
 
     with open('/tmp/document.json') as f:
         doc = json.load(f)
@@ -49,7 +51,7 @@
                 cell_text += '_'
 
         # Verify if `Text` could be a valid date
-        date_string = is_date(cell_text)
+        date_string = is_date(clean_text(cell_text, 'date'))
         if date_string:
             cell_text = date_string
             result['dateRow'] = cell['RowIndex']
@@ -57,7 +59,7 @@
 
         cell_row_index = cell['RowIndex']
         cell_column_index = cell['ColumnIndex']
-        data[cell_row_index][cell_column_index] = clean(cell_text)
+        data[cell_row_index][cell_column_index] = clean_text(cell_text)
 
         try:
             data[cell_row_index]['type'] = cell['EntityTypes']
@@ -75,12 +77,12 @@
             if len(row) > 1:
                 result['data'][row_index] = row
 
-    filename = object_key.replace('analyzed/', 'processed/')
+    object_key = event_msg['objectKey'].replace('analyzed/', 'processed/')
     data_string = json.dumps(result, indent=2, default=str)
 
     s3_client.put_object(
-        Bucket=bucket_name,
-        Key=filename,
+        Bucket=event_msg['bucketName'],
+        Key=object_key,
         Body=data_string
     )
 
@@ -88,8 +90,12 @@
         "statusCode": 200,
         "body": {
             "message": {
-                "objectKey": filename,
-                "bucketName": bucket_name
+                "companyTicker": event_msg['companyTicker'],
+                "docType": event_msg['docType'],
+                "fileId": event_msg['fileId'],
+                "fileName": event_msg['fileName'],
+                "objectKey": object_key,
+                "bucketName": event_msg['bucketName']
             }
         },
     }
@@ -106,34 +112,20 @@
 def is_date(string_date):
     """
     Verify if a string could be a date.
-
-    -> Funciona pero es un desastre <-
     """
 
-    formats_allowed = ['%d-%m-%Y', '%d_%m_%Y', '%d/%m/%Y', '%d.%m.%Y', '%Y']
+    formats_allowed = ['%d-%m-%Y', '%d/%m/%Y', '%Y']
 
     for format_allowed in formats_allowed:
         try:
             date = datetime.strptime(string_date, format_allowed)
 
             if date.year > datetime.now().year or date.year < 1900:
-                return  # Fecha fuera de rango
+                return  # Date out of range
 
             return date.strftime("%Y")
         except ValueError:
-
-            # Try removing characters from the beginning and end
-            options = [string_date[:-1], string_date[1:], string_date[1:-1]]
-            for option in options:
-                try:
-                    date = datetime.strptime(option, format_allowed)
-
-                    if date.year > datetime.now().year or date.year < 1900:
-                        return  # Fecha fuera de rango
-
-                    return date.strftime("%Y")
-                except ValueError:
-                    continue
+            continue
 
     return
 
@@ -152,14 +144,32 @@
             return amount_format
 
 
-def clean(text):
+def clean_text(text, text_type='default'):
     """"
     Remove bad characters from word
     """
 
-    characters = ['.', ',', '-', ' ']
+    special_chars = [
+        '!', '@', '#', '$', '%', '^', '&', '*', '(', ')',
+        '-', '_', '+', '=', '[', ']', '{', '}', '\\', '|',
+        ';', ':', '"', '\'', '<', '>', '/', '?', '.', ','
+    ]
+
+    if text_type == 'date':
+        allowed_chars = ['_', '-', '/']
 
-    for character in characters:
-        text = text.replace(character, '')
+        # Sometimes a date reads '2020a' or 'b2020' because of an attached index letter
+        if text[-1].isalpha():
+            special_chars.append(text[-1])
+
+        if text[0].isalpha():
+            special_chars.append(text[0])
+    else:
+        allowed_chars = ['.', ',', '-', ' ']
+
+    special_chars = [char for char in special_chars if char not in allowed_chars]
+
+    for char in special_chars:
+        text = text.replace(char, '')
 
     return text.lower()
author	Dennis C. M. <dennis@denniscm.com>
date	Mon, 05 Jun 2023 10:13:43 +0100
parents	2e5f3664f3e4
children	d15ccf5f1373