diff process_document/app.py @ 6:d15ccf5f1373

Fix bug in clean_text
author Dennis C. M. <dennis@denniscm.com>
date Mon, 05 Jun 2023 17:12:18 +0100
parents 9005b7590008
children bf19235a9636
line wrap: on
line diff
--- a/process_document/app.py	Mon Jun 05 12:48:47 2023 +0100
+++ b/process_document/app.py	Mon Jun 05 17:12:18 2023 +0100
@@ -51,7 +51,7 @@
                 cell_text += '_'
 
         # Verify if `Text` could be a valid date
-        date_string = is_date(clean_text(cell_text, 'date'))
+        date_string = is_date(cell_text)
         if date_string:
             cell_text = date_string
             result['dateRow'] = cell['RowIndex']
@@ -121,11 +121,23 @@
             date = datetime.strptime(string_date, format_allowed)
 
             if date.year > datetime.now().year or date.year < 1900:
-                return  # Date out of range date
+                return  # Date out of range
 
             return date.strftime("%Y")
         except ValueError:
-            continue
+
+            # Try removing characters from the beginning and end
+            options = [string_date[:-1], string_date[1:], string_date[1:-1]]
+            for option in options:
+                try:
+                    date = datetime.strptime(option, format_allowed)
+
+                    if date.year > datetime.now().year or date.year < 1900:
+                        return  # Date out of range
+
+                    return date.strftime("%Y")
+                except ValueError:
+                    continue
 
     return
 
@@ -157,15 +169,8 @@
 
     if text_type == 'date':
         allowed_chars = ['_', '-', '/']
-
-        # Sometimes date is '2020a' or 'b2020' because indexes
-        if text[-1].isalpha():
-            special_chars.append(text[-1])
-
-        if text[0].isalpha():
-            special_chars.append(text[0])
     else:
-        allowed_chars = ['.', ',', '-', ' ']
+        allowed_chars = ['_']
 
     special_chars = [char for char in special_chars if char not in allowed_chars]