Mercurial > public > finance-parser
comparison process_document/app.py @ 6:d15ccf5f1373
fix bug in clean_text
author | Dennis C. M. <dennis@denniscm.com> |
---|---|
date | Mon, 05 Jun 2023 17:12:18 +0100 |
parents | 9005b7590008 |
children | bf19235a9636 |
comparison
equal
deleted
inserted
replaced
5:2daf0dc08247 | 6:d15ccf5f1373 |
---|---|
49 | 49 |
50 if index < len(child_ids) - 1: | 50 if index < len(child_ids) - 1: |
51 cell_text += '_' | 51 cell_text += '_' |
52 | 52 |
53 # Verify if `Text` could be a valid date | 53 # Verify if `Text` could be a valid date |
54 date_string = is_date(clean_text(cell_text, 'date')) | 54 date_string = is_date(cell_text) |
55 if date_string: | 55 if date_string: |
56 cell_text = date_string | 56 cell_text = date_string |
57 result['dateRow'] = cell['RowIndex'] | 57 result['dateRow'] = cell['RowIndex'] |
58 result['dateColumns'][cell['ColumnIndex']] = date_string | 58 result['dateColumns'][cell['ColumnIndex']] = date_string |
59 | 59 |
119 for format_allowed in formats_allowed: | 119 for format_allowed in formats_allowed: |
120 try: | 120 try: |
121 date = datetime.strptime(string_date, format_allowed) | 121 date = datetime.strptime(string_date, format_allowed) |
122 | 122 |
123 if date.year > datetime.now().year or date.year < 1900: | 123 if date.year > datetime.now().year or date.year < 1900: |
124 return # Date out of range date | 124 return # Fecha fuera de rango |
125 | 125 |
126 return date.strftime("%Y") | 126 return date.strftime("%Y") |
127 except ValueError: | 127 except ValueError: |
128 continue | 128 |
129 # Try removing characters from the beginning and end | |
130 options = [string_date[:-1], string_date[1:], string_date[1:-1]] | |
131 for option in options: | |
132 try: | |
133 date = datetime.strptime(option, format_allowed) | |
134 | |
135 if date.year > datetime.now().year or date.year < 1900: | |
136 return # Fecha fuera de rango | |
137 | |
138 return date.strftime("%Y") | |
139 except ValueError: | |
140 continue | |
129 | 141 |
130 return | 142 return |
131 | 143 |
132 | 144 |
133 def get_format(phrase): | 145 def get_format(phrase): |
155 ';', ':', '"', '\'', '<', '>', '/', '?', '.', ',' | 167 ';', ':', '"', '\'', '<', '>', '/', '?', '.', ',' |
156 ] | 168 ] |
157 | 169 |
158 if text_type == 'date': | 170 if text_type == 'date': |
159 allowed_chars = ['_', '-', '/'] | 171 allowed_chars = ['_', '-', '/'] |
160 | |
161 # Sometimes date is '2020a' or 'b2020' because indexes | |
162 if text[-1].isalpha(): | |
163 special_chars.append(text[-1]) | |
164 | |
165 if text[0].isalpha(): | |
166 special_chars.append(text[0]) | |
167 else: | 172 else: |
168 allowed_chars = ['.', ',', '-', ' '] | 173 allowed_chars = ['_'] |
169 | 174 |
170 special_chars = [char for char in special_chars if char not in allowed_chars] | 175 special_chars = [char for char in special_chars if char not in allowed_chars] |
171 | 176 |
172 for char in special_chars: | 177 for char in special_chars: |
173 text = text.replace(char, '') | 178 text = text.replace(char, '') |