Mercurial > public > finance-parser
comparison process_document/app.py @ 4:9005b7590008
state machine working
author | Dennis C. M. <dennis@denniscm.com> |
---|---|
date | Mon, 05 Jun 2023 10:13:43 +0100 |
parents | 2e5f3664f3e4 |
children | d15ccf5f1373 |
comparison
Legend: equal, deleted, inserted, replaced
3:2e5f3664f3e4 | 4:9005b7590008 |
---|---|
6 | 6 |
7 s3_client = boto3.client('s3') | 7 s3_client = boto3.client('s3') |
8 | 8 |
9 | 9 |
10 def lambda_handler(event, context): | 10 def lambda_handler(event, context): |
11 event_message = event['body']['message'] | 11 event_msg = event['body']['message'] |
12 object_key = event_message['objectKey'] | |
13 bucket_name = event_message['bucketName'] | |
14 | 12 |
15 # Download file from s3 | 13 # Download file from s3 |
16 s3_client.download_file(bucket_name, object_key, '/tmp/document.json') | 14 s3_client.download_file( |
15 event_msg['bucketName'], | |
16 event_msg['objectKey'], | |
17 '/tmp/document.json' | |
18 ) | |
17 | 19 |
18 with open('/tmp/document.json') as f: | 20 with open('/tmp/document.json') as f: |
19 doc = json.load(f) | 21 doc = json.load(f) |
20 | 22 |
21 # Analyze document | 23 # Analyze document |
47 | 49 |
48 if index < len(child_ids) - 1: | 50 if index < len(child_ids) - 1: |
49 cell_text += '_' | 51 cell_text += '_' |
50 | 52 |
51 # Verify if `Text` could be a valid date | 53 # Verify if `Text` could be a valid date |
52 date_string = is_date(cell_text) | 54 date_string = is_date(clean_text(cell_text, 'date')) |
53 if date_string: | 55 if date_string: |
54 cell_text = date_string | 56 cell_text = date_string |
55 result['dateRow'] = cell['RowIndex'] | 57 result['dateRow'] = cell['RowIndex'] |
56 result['dateColumns'][cell['ColumnIndex']] = date_string | 58 result['dateColumns'][cell['ColumnIndex']] = date_string |
57 | 59 |
58 cell_row_index = cell['RowIndex'] | 60 cell_row_index = cell['RowIndex'] |
59 cell_column_index = cell['ColumnIndex'] | 61 cell_column_index = cell['ColumnIndex'] |
60 data[cell_row_index][cell_column_index] = clean(cell_text) | 62 data[cell_row_index][cell_column_index] = clean_text(cell_text) |
61 | 63 |
62 try: | 64 try: |
63 data[cell_row_index]['type'] = cell['EntityTypes'] | 65 data[cell_row_index]['type'] = cell['EntityTypes'] |
64 except KeyError: | 66 except KeyError: |
65 pass | 67 pass |
73 del row[column_index] | 75 del row[column_index] |
74 | 76 |
75 if len(row) > 1: | 77 if len(row) > 1: |
76 result['data'][row_index] = row | 78 result['data'][row_index] = row |
77 | 79 |
78 filename = object_key.replace('analyzed/', 'processed/') | 80 object_key = event_msg['objectKey'].replace('analyzed/', 'processed/') |
79 data_string = json.dumps(result, indent=2, default=str) | 81 data_string = json.dumps(result, indent=2, default=str) |
80 | 82 |
81 s3_client.put_object( | 83 s3_client.put_object( |
82 Bucket=bucket_name, | 84 Bucket=event_msg['bucketName'], |
83 Key=filename, | 85 Key=object_key, |
84 Body=data_string | 86 Body=data_string |
85 ) | 87 ) |
86 | 88 |
87 return { | 89 return { |
88 "statusCode": 200, | 90 "statusCode": 200, |
89 "body": { | 91 "body": { |
90 "message": { | 92 "message": { |
91 "objectKey": filename, | 93 "companyTicker": event_msg['companyTicker'], |
92 "bucketName": bucket_name | 94 "docType": event_msg['docType'], |
95 "fileId": event_msg['fileId'], | |
96 "fileName": event_msg['fileName'], | |
97 "objectKey": object_key, | |
98 "bucketName": event_msg['bucketName'] | |
93 } | 99 } |
94 }, | 100 }, |
95 } | 101 } |
96 | 102 |
97 | 103 |
104 | 110 |
105 | 111 |
106 def is_date(string_date): | 112 def is_date(string_date): |
107 """ | 113 """ |
108 Verify if a string could be a date. | 114 Verify if a string could be a date. |
109 | |
110 -> Funciona pero es un desastre <- | |
111 """ | 115 """ |
112 | 116 |
113 formats_allowed = ['%d-%m-%Y', '%d_%m_%Y', '%d/%m/%Y', '%d.%m.%Y', '%Y'] | 117 formats_allowed = ['%d-%m-%Y', '%d/%m/%Y', '%Y'] |
114 | 118 |
115 for format_allowed in formats_allowed: | 119 for format_allowed in formats_allowed: |
116 try: | 120 try: |
117 date = datetime.strptime(string_date, format_allowed) | 121 date = datetime.strptime(string_date, format_allowed) |
118 | 122 |
119 if date.year > datetime.now().year or date.year < 1900: | 123 if date.year > datetime.now().year or date.year < 1900: |
120 return # Fecha fuera de rango | 124 return # Date out of range date |
121 | 125 |
122 return date.strftime("%Y") | 126 return date.strftime("%Y") |
123 except ValueError: | 127 except ValueError: |
124 | 128 continue |
125 # Try removing characters from the beginning and end | |
126 options = [string_date[:-1], string_date[1:], string_date[1:-1]] | |
127 for option in options: | |
128 try: | |
129 date = datetime.strptime(option, format_allowed) | |
130 | |
131 if date.year > datetime.now().year or date.year < 1900: | |
132 return # Fecha fuera de rango | |
133 | |
134 return date.strftime("%Y") | |
135 except ValueError: | |
136 continue | |
137 | 129 |
138 return | 130 return |
139 | 131 |
140 | 132 |
141 def get_format(phrase): | 133 def get_format(phrase): |
150 | 142 |
151 if amount_format in phrase or plural_amount_format in phrase: | 143 if amount_format in phrase or plural_amount_format in phrase: |
152 return amount_format | 144 return amount_format |
153 | 145 |
154 | 146 |
155 def clean(text): | 147 def clean_text(text, text_type='default'): |
156 """" | 148 """" |
157 Remove bad characters from word | 149 Remove bad characters from word |
158 """ | 150 """ |
159 | 151 |
160 characters = ['.', ',', '-', ' '] | 152 special_chars = [ |
153 '!', '@', '#', '$', '%', '^', '&', '*', '(', ')', | |
154 '-', '_', '+', '=', '[', ']', '{', '}', '\\', '|', | |
155 ';', ':', '"', '\'', '<', '>', '/', '?', '.', ',' | |
156 ] | |
161 | 157 |
162 for character in characters: | 158 if text_type == 'date': |
163 text = text.replace(character, '') | 159 allowed_chars = ['_', '-', '/'] |
160 | |
161 # Sometimes date is '2020a' or 'b2020' because indexes | |
162 if text[-1].isalpha(): | |
163 special_chars.append(text[-1]) | |
164 | |
165 if text[0].isalpha(): | |
166 special_chars.append(text[0]) | |
167 else: | |
168 allowed_chars = ['.', ',', '-', ' '] | |
169 | |
170 special_chars = [char for char in special_chars if char not in allowed_chars] | |
171 | |
172 for char in special_chars: | |
173 text = text.replace(char, '') | |
164 | 174 |
165 return text.lower() | 175 return text.lower() |