Mercurial > public > finance-parser
diff process_document/app.py @ 6:d15ccf5f1373
fix bug clean_text
author | Dennis C. M. <dennis@denniscm.com> |
---|---|
date | Mon, 05 Jun 2023 17:12:18 +0100 |
parents | 9005b7590008 |
children | bf19235a9636 |
line wrap: on
line diff
```diff
--- a/process_document/app.py Mon Jun 05 12:48:47 2023 +0100
+++ b/process_document/app.py Mon Jun 05 17:12:18 2023 +0100
@@ -51,7 +51,7 @@
                 cell_text += '_'
 
                 # Verify if `Text` could be a valid date
-                date_string = is_date(clean_text(cell_text, 'date'))
+                date_string = is_date(cell_text)
                 if date_string:
                     cell_text = date_string
                     result['dateRow'] = cell['RowIndex']
@@ -121,11 +121,23 @@
             date = datetime.strptime(string_date, format_allowed)
 
             if date.year > datetime.now().year or date.year < 1900:
-                return # Date out of range date
+                return # Fecha fuera de rango
 
             return date.strftime("%Y")
         except ValueError:
-            continue
+
+            # Try removing characters from the beginning and end
+            options = [string_date[:-1], string_date[1:], string_date[1:-1]]
+            for option in options:
+                try:
+                    date = datetime.strptime(option, format_allowed)
+
+                    if date.year > datetime.now().year or date.year < 1900:
+                        return # Fecha fuera de rango
+
+                    return date.strftime("%Y")
+                except ValueError:
+                    continue
 
     return
 
@@ -157,15 +169,8 @@
 
     if text_type == 'date':
         allowed_chars = ['_', '-', '/']
-
-        # Sometimes date is '2020a' or 'b2020' because indexes
-        if text[-1].isalpha():
-            special_chars.append(text[-1])
-
-        if text[0].isalpha():
-            special_chars.append(text[0])
     else:
-        allowed_chars = ['.', ',', '-', ' ']
+        allowed_chars = ['_']
 
     special_chars = [char for char in special_chars if char not in allowed_chars]
 
```