comparison process_document/app.py @ 6:d15ccf5f1373

Fix bug in clean_text
author Dennis C. M. <dennis@denniscm.com>
date Mon, 05 Jun 2023 17:12:18 +0100
parents 9005b7590008
children bf19235a9636
comparison
equal deleted inserted replaced
5:2daf0dc08247 6:d15ccf5f1373
49 49
50 if index < len(child_ids) - 1: 50 if index < len(child_ids) - 1:
51 cell_text += '_' 51 cell_text += '_'
52 52
53 # Verify if `Text` could be a valid date 53 # Verify if `Text` could be a valid date
54 date_string = is_date(clean_text(cell_text, 'date')) 54 date_string = is_date(cell_text)
55 if date_string: 55 if date_string:
56 cell_text = date_string 56 cell_text = date_string
57 result['dateRow'] = cell['RowIndex'] 57 result['dateRow'] = cell['RowIndex']
58 result['dateColumns'][cell['ColumnIndex']] = date_string 58 result['dateColumns'][cell['ColumnIndex']] = date_string
59 59
119 for format_allowed in formats_allowed: 119 for format_allowed in formats_allowed:
120 try: 120 try:
121 date = datetime.strptime(string_date, format_allowed) 121 date = datetime.strptime(string_date, format_allowed)
122 122
123 if date.year > datetime.now().year or date.year < 1900: 123 if date.year > datetime.now().year or date.year < 1900:
124 return # Date out of range date 124 return # Fecha fuera de rango
125 125
126 return date.strftime("%Y") 126 return date.strftime("%Y")
127 except ValueError: 127 except ValueError:
128 continue 128
129 # Try removing characters from the beginning and end
130 options = [string_date[:-1], string_date[1:], string_date[1:-1]]
131 for option in options:
132 try:
133 date = datetime.strptime(option, format_allowed)
134
135 if date.year > datetime.now().year or date.year < 1900:
136 return # Fecha fuera de rango
137
138 return date.strftime("%Y")
139 except ValueError:
140 continue
129 141
130 return 142 return
131 143
132 144
133 def get_format(phrase): 145 def get_format(phrase):
155 ';', ':', '"', '\'', '<', '>', '/', '?', '.', ',' 167 ';', ':', '"', '\'', '<', '>', '/', '?', '.', ','
156 ] 168 ]
157 169
158 if text_type == 'date': 170 if text_type == 'date':
159 allowed_chars = ['_', '-', '/'] 171 allowed_chars = ['_', '-', '/']
160
161 # Sometimes date is '2020a' or 'b2020' because indexes
162 if text[-1].isalpha():
163 special_chars.append(text[-1])
164
165 if text[0].isalpha():
166 special_chars.append(text[0])
167 else: 172 else:
168 allowed_chars = ['.', ',', '-', ' '] 173 allowed_chars = ['_']
169 174
170 special_chars = [char for char in special_chars if char not in allowed_chars] 175 special_chars = [char for char in special_chars if char not in allowed_chars]
171 176
172 for char in special_chars: 177 for char in special_chars:
173 text = text.replace(char, '') 178 text = text.replace(char, '')