changeset 4:9005b7590008

state machine working
author Dennis C. M. <dennis@denniscm.com>
date Mon, 05 Jun 2023 10:13:43 +0100
parents 2e5f3664f3e4
children 2daf0dc08247
files analyze_document/app.py events/analyze_document_event.json events/process_document_event.json events/upload_document_event.json process_document/app.py reports/itx_balance.pdf reports/san_balance.pdf upload_document/app.py
diffstat 8 files changed, 68 insertions(+), 51 deletions(-) [+]
line wrap: on
line diff
--- a/analyze_document/app.py	Fri Jun 02 20:12:29 2023 +0100
+++ b/analyze_document/app.py	Mon Jun 05 10:13:43 2023 +0100
@@ -12,7 +12,10 @@
     event_detail = event['detail']
     bucket_name = event_detail['bucket']['name']
     object_key = event_detail['object']['key']
-    company_ticker = re.search('unprocessed/(.*).pdf', object_key).group(1)
+
+    company_ticker = re.search('unprocessed/(.*)_', object_key).group(1)
+    doc_type = re.search(f'unprocessed/{company_ticker}_(.*).pdf', object_key).group(1)
+    file_id = uuid.uuid4()
 
     data_dict = textract_client.analyze_document(
         Document={'S3Object': {'Bucket': bucket_name, 'Name': object_key}},
@@ -20,7 +23,7 @@
     )
 
     data_string = json.dumps(data_dict, indent=2, default=str)
-    filename = f'{company_ticker}_{uuid.uuid4()}.json'
+    filename = f'{company_ticker}_{doc_type}_{file_id}.json'
 
     s3_client.put_object(
         Bucket=bucket_name,
@@ -37,6 +40,10 @@
         "statusCode": 200,
         "body": {
             "message": {
+                "companyTicker": company_ticker,
+                "docType": doc_type,
+                "fileId": file_id,
+                "fileName": filename,
                 "objectKey": f'analyzed/{filename}',
                 "bucketName": bucket_name
             }
--- a/events/analyze_document_event.json	Fri Jun 02 20:12:29 2023 +0100
+++ b/events/analyze_document_event.json	Mon Jun 05 10:13:43 2023 +0100
@@ -15,7 +15,7 @@
          "name":"sandbox-finance-parser-data"
       },
       "object":{
-         "key":"unprocessed/san.pdf",
+         "key":"unprocessed/san_balance.pdf",
          "size":49856,
          "etag":"0adc595c8f2dbfabb5c4095f1f91b458",
          "sequencer":"00647A159E6438B1A6"
--- a/events/process_document_event.json	Fri Jun 02 20:12:29 2023 +0100
+++ b/events/process_document_event.json	Mon Jun 05 10:13:43 2023 +0100
@@ -2,7 +2,7 @@
    "statusCode": 200,
    "body": {
       "message": {
-         "objectKey": "analyzed/san_f0799678-a362-4b7f-9fff-c26b0bbf2b15.json",
+         "objectKey": "analyzed/san_balance_f0799678-a362-4b7f-9fff-c26b0bbf2b15.json",
          "bucketName": "sandbox-finance-parser-data"
       }
    }
--- a/events/upload_document_event.json	Fri Jun 02 20:12:29 2023 +0100
+++ b/events/upload_document_event.json	Mon Jun 05 10:13:43 2023 +0100
@@ -2,7 +2,7 @@
    "statusCode": 200,
    "body": {
       "message": {
-         "objectKey": "processed/san_d7312109-9099-4dd2-a984-55768641b25e.json",
+         "objectKey": "processed/san_balance_d7312109-9099-4dd2-a984-55768641b25e.json",
          "bucketName": "sandbox-finance-parser-data"
       }
    }
--- a/process_document/app.py	Fri Jun 02 20:12:29 2023 +0100
+++ b/process_document/app.py	Mon Jun 05 10:13:43 2023 +0100
@@ -8,12 +8,14 @@
 
 
 def lambda_handler(event, context):
-    event_message = event['body']['message']
-    object_key = event_message['objectKey']
-    bucket_name = event_message['bucketName']
+    event_msg = event['body']['message']
 
     # Download file from s3
-    s3_client.download_file(bucket_name, object_key, '/tmp/document.json')
+    s3_client.download_file(
+        event_msg['bucketName'],
+        event_msg['objectKey'],
+        '/tmp/document.json'
+    )
 
     with open('/tmp/document.json') as f:
         doc = json.load(f)
@@ -49,7 +51,7 @@
                 cell_text += '_'
 
         # Verify if `Text` could be a valid date
-        date_string = is_date(cell_text)
+        date_string = is_date(clean_text(cell_text, 'date'))
         if date_string:
             cell_text = date_string
             result['dateRow'] = cell['RowIndex']
@@ -57,7 +59,7 @@
 
         cell_row_index = cell['RowIndex']
         cell_column_index = cell['ColumnIndex']
-        data[cell_row_index][cell_column_index] = clean(cell_text)
+        data[cell_row_index][cell_column_index] = clean_text(cell_text)
 
         try:
             data[cell_row_index]['type'] = cell['EntityTypes']
@@ -75,12 +77,12 @@
             if len(row) > 1:
                 result['data'][row_index] = row
 
-    filename = object_key.replace('analyzed/', 'processed/')
+    object_key = event_msg['objectKey'].replace('analyzed/', 'processed/')
     data_string = json.dumps(result, indent=2, default=str)
 
     s3_client.put_object(
-        Bucket=bucket_name,
-        Key=filename,
+        Bucket=event_msg['bucketName'],
+        Key=object_key,
         Body=data_string
     )
 
@@ -88,8 +90,12 @@
         "statusCode": 200,
         "body": {
             "message": {
-                "objectKey": filename,
-                "bucketName": bucket_name
+                "companyTicker": event_msg['companyTicker'],
+                "docType": event_msg['docType'],
+                "fileId": event_msg['fileId'],
+                "fileName": event_msg['fileName'],
+                "objectKey": object_key,
+                "bucketName": event_msg['bucketName']
             }
         },
     }
@@ -106,34 +112,20 @@
 def is_date(string_date):
     """
     Verify if a string could be a date.
-
-    -> Funciona pero es un desastre <-
     """
 
-    formats_allowed = ['%d-%m-%Y', '%d_%m_%Y', '%d/%m/%Y', '%d.%m.%Y', '%Y']
+    formats_allowed = ['%d-%m-%Y', '%d/%m/%Y', '%Y']
 
     for format_allowed in formats_allowed:
         try:
             date = datetime.strptime(string_date, format_allowed)
 
             if date.year > datetime.now().year or date.year < 1900:
-                return  # Fecha fuera de rango
+                return  # Date out of range
 
             return date.strftime("%Y")
         except ValueError:
-
-            # Try removing characters from the beginning and end
-            options = [string_date[:-1], string_date[1:], string_date[1:-1]]
-            for option in options:
-                try:
-                    date = datetime.strptime(option, format_allowed)
-
-                    if date.year > datetime.now().year or date.year < 1900:
-                        return  # Fecha fuera de rango
-
-                    return date.strftime("%Y")
-                except ValueError:
-                    continue
+            continue
 
     return
 
@@ -152,14 +144,32 @@
             return amount_format
 
 
-def clean(text):
+def clean_text(text, text_type='default'):
     """"
     Remove bad characters from word
     """
 
-    characters = ['.', ',', '-', ' ']
+    special_chars = [
+        '!', '@', '#', '$', '%', '^', '&', '*', '(', ')',
+        '-', '_', '+', '=', '[', ']', '{', '}', '\\', '|',
+        ';', ':', '"', '\'', '<', '>', '/', '?', '.', ','
+    ]
+
+    if text_type == 'date':
+        allowed_chars = ['_', '-', '/']
 
-    for character in characters:
-        text = text.replace(character, '')
+        # Sometimes the date reads '2020a' or 'b2020' because an index letter is attached to it
+        if text[-1].isalpha():
+            special_chars.append(text[-1])
+
+        if text[0].isalpha():
+            special_chars.append(text[0])
+    else:
+        allowed_chars = ['.', ',', '-', ' ']
+
+    special_chars = [char for char in special_chars if char not in allowed_chars]
+
+    for char in special_chars:
+        text = text.replace(char, '')
 
     return text.lower()
Binary file reports/itx_balance.pdf has changed
Binary file reports/san_balance.pdf has changed
--- a/upload_document/app.py	Fri Jun 02 20:12:29 2023 +0100
+++ b/upload_document/app.py	Mon Jun 05 10:13:43 2023 +0100
@@ -1,6 +1,6 @@
 import json
 import boto3
-import re
+
 
 s3_client = boto3.client('s3')
 dynamodb = boto3.resource('dynamodb')
@@ -8,13 +8,14 @@
 
 
 def lambda_handler(event, context):
-    event_message = event['body']['message']
-    object_key = event_message['objectKey']
-    bucket_name = event_message['bucketName']
-    company_ticker = re.search('processed/(.*)_', object_key).group(1)
+    event_msg = event['body']['message']
 
     # Download file from s3
-    s3_client.download_file(bucket_name, object_key, '/tmp/document.json')
+    s3_client.download_file(
+        event_msg['bucketName'],
+        event_msg['objectKey'],
+        '/tmp/document.json'
+    )
 
     with open('/tmp/document.json') as f:
         doc = json.load(f)
@@ -28,9 +29,7 @@
                 column_types = []
 
             """
-            The following statement avoids getting a `2020` as the value 
-            of `ASSETS`.
-            
+            Given:
             +------------------+------+------+
             | ASSETS           | 2020 | 2019 |
             +------------------+------+------+
@@ -38,6 +37,8 @@
             +------------------+------+------+
             | ASSETS_ACCOUNT_2 |      |      |
             +------------------+------+------+
+            
+            The following statement avoids getting `2020` as the value of `ASSETS`.
             """
 
             account_value = account[dateColumn]
@@ -51,7 +52,7 @@
 
                 batch.put_item(
                     Item={
-                        'pk': f'balance#{company_ticker}',
+                        'pk': f"balance#{event_msg['companyTicker']}",
                         'sk': f'{date}#{row_index}',
                         'account_name': account['1'],
                         'account_value': account_value,
@@ -60,13 +61,12 @@
                 )
 
         # pk -> item_type#company_ticker
-        # sk -> date
+        # sk -> date#filename
 
         table.put_item(
             Item={
-                'pk': f'file#{company_ticker}',
-                'sk': f"{date}",
-                'filename': object_key.replace('processed/', '')
+                'pk': f"file#{event_msg['companyTicker']}",
+                'sk': f"{date}#{event_msg['objectKey'].replace('processed/', '')}"
             }
         )