annotate process_document/app.py @ 11:d09dee7a86da

fix KeyError bug
author Dennis C. M. <dennis@denniscm.com>
date Thu, 08 Jun 2023 17:35:26 +0100
parents bf19235a9636
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
2
ef8a4d95755a add aws sam project
Dennis C. M. <dennis@denniscm.com>
parents:
diff changeset
1 import json
ef8a4d95755a add aws sam project
Dennis C. M. <dennis@denniscm.com>
parents:
diff changeset
2 import boto3
ef8a4d95755a add aws sam project
Dennis C. M. <dennis@denniscm.com>
parents:
diff changeset
3 from datetime import datetime
ef8a4d95755a add aws sam project
Dennis C. M. <dennis@denniscm.com>
parents:
diff changeset
4 from collections import defaultdict
ef8a4d95755a add aws sam project
Dennis C. M. <dennis@denniscm.com>
parents:
diff changeset
5
3
2e5f3664f3e4 documents analyzer almost finished
Dennis C. M. <dennis@denniscm.com>
parents: 2
diff changeset
6
2
ef8a4d95755a add aws sam project
Dennis C. M. <dennis@denniscm.com>
parents:
diff changeset
7 s3_client = boto3.client('s3')
ef8a4d95755a add aws sam project
Dennis C. M. <dennis@denniscm.com>
parents:
diff changeset
8
ef8a4d95755a add aws sam project
Dennis C. M. <dennis@denniscm.com>
parents:
diff changeset
9
ef8a4d95755a add aws sam project
Dennis C. M. <dennis@denniscm.com>
parents:
diff changeset
def _cell_text(cell, blocks_by_id):
    """
    Build the text of a CELL block by joining its children with underscores.

    :param cell: CELL block (dict) taken from the document's `Blocks` list
    :param blocks_by_id: mapping of block `Id` to block, used to resolve children
    :return: the joined text (possibly empty), or None when the cell has no
        `CHILD` relationship and therefore nothing to read
    """

    child_ids = next(
        (
            relationship['Ids']
            for relationship in cell.get('Relationships', [])
            if relationship['Type'] == 'CHILD'
        ),
        None
    )
    if child_ids is None:
        return None

    # Children that cannot be resolved or that carry no `Text` entry are
    # skipped instead of aborting the whole document.
    # NOTE(review): presumably these are Textract selection elements
    # (checkboxes) -- confirm against real documents.
    texts = []
    for child_id in child_ids:
        child = blocks_by_id.get(child_id)
        if child is not None and 'Text' in child:
            texts.append(child['Text'])

    return '_'.join(texts)


def lambda_handler(event, context):
    """
    Process an analyzed document stored in S3 and upload the extracted table.

    Reads `event['body']['message']` (keys used: `bucketName`, `objectKey`,
    `companyTicker`, `docType`, `fileId`, `fileName`), downloads the JSON
    document it points to, and builds a result with:

    - `format`: first amount format found in a LINE block (see `get_format`)
    - `dateRow` / `dateColumns`: position and year of the cells that look
      like dates (see `is_date`)
    - `data`: the table rows reduced to column 1, the date columns and the
      row `type`

    The result is uploaded to the same bucket, with `analyzed/` replaced by
    `processed/` in the object key.

    :param event: invocation payload, see above
    :param context: Lambda context object (unused)
    :return: dict with `statusCode` 200 and the message for the next step,
        where `objectKey` points to the processed file
    """

    event_msg = event['body']['message']

    # Download file from s3
    s3_client.download_file(
        event_msg['bucketName'],
        event_msg['objectKey'],
        '/tmp/document.json'
    )

    with open('/tmp/document.json', encoding='utf-8') as f:
        doc = json.load(f)

    # Analyze document
    result = defaultdict(dict)
    blocks = doc['Blocks']

    # Index blocks once so resolving a child id does not rescan every block.
    # `setdefault` keeps the first block seen for an id.
    blocks_by_id = {}
    for block in blocks:
        if 'Id' in block:
            blocks_by_id.setdefault(block['Id'], block)

    # Get format: stop at the first line that states one. When no line does,
    # `format` stays None (or is absent if the document has no lines at all).
    for line in filter_blocks(blocks, 'BlockType', 'LINE'):
        amount_format = get_format(line['Text'])
        result['format'] = amount_format
        if amount_format:
            break

    # Find dates value and position
    data = defaultdict(dict)
    for cell in filter_blocks(blocks, 'BlockType', 'CELL'):
        cell_text = _cell_text(cell, blocks_by_id)
        if cell_text is None:
            continue

        cell_row_index = cell['RowIndex']
        cell_column_index = cell['ColumnIndex']

        # Verify if `Text` could be a valid date
        date_string = is_date(cell_text)
        if date_string:
            cell_text = date_string
            result['dateRow'] = cell_row_index
            result['dateColumns'][cell_column_index] = date_string

        data[cell_row_index][cell_column_index] = clean_text(cell_text)

        # Not every cell carries `EntityTypes`
        if 'EntityTypes' in cell:
            data[cell_row_index]['type'] = cell['EntityTypes']

    # Delete unused row and columns: keep only the date columns, column 1
    # and the row type. NOTE(review): column 1 presumably holds the row
    # label -- confirm.
    for row_index, row in data.items():
        for column_index in list(row):
            keep_column = (
                column_index in result['dateColumns']
                or column_index == 1
                or column_index == 'type'
            )
            if not keep_column:
                del row[column_index]

        # A row with a single remaining entry has no value next to its label
        if len(row) > 1:
            result['data'][row_index] = row

    object_key = event_msg['objectKey'].replace('analyzed/', 'processed/')
    data_string = json.dumps(result, indent=2, default=str)

    s3_client.put_object(
        Bucket=event_msg['bucketName'],
        Key=object_key,
        Body=data_string
    )

    return {
        "statusCode": 200,
        "body": {
            "message": {
                "companyTicker": event_msg['companyTicker'],
                "docType": event_msg['docType'],
                "fileId": event_msg['fileId'],
                "fileName": event_msg['fileName'],
                "objectKey": object_key,
                "bucketName": event_msg['bucketName']
            }
        },
    }
ef8a4d95755a add aws sam project
Dennis C. M. <dennis@denniscm.com>
parents:
diff changeset
102
ef8a4d95755a add aws sam project
Dennis C. M. <dennis@denniscm.com>
parents:
diff changeset
103
ef8a4d95755a add aws sam project
Dennis C. M. <dennis@denniscm.com>
parents:
diff changeset
def filter_blocks(blocks, block_key, block_value):
    """
    Return every block whose `block_key` entry equals `block_value`.

    Blocks that do not have `block_key` at all are treated as non-matching
    instead of raising KeyError, so a single incomplete block does not abort
    the search.

    :param blocks: iterable of block dicts
    :param block_key: key to look up in each block
    :param block_value: value the key must be equal to
    :return: list of matching blocks, in their original order (empty if none)
    """

    return [
        block for block in blocks
        if block_key in block and block[block_key] == block_value
    ]
ef8a4d95755a add aws sam project
Dennis C. M. <dennis@denniscm.com>
parents:
diff changeset
110
ef8a4d95755a add aws sam project
Dennis C. M. <dennis@denniscm.com>
parents:
diff changeset
111
ef8a4d95755a add aws sam project
Dennis C. M. <dennis@denniscm.com>
parents:
diff changeset
def is_date(string_date):
    """
    Verify if a string could be a date.

    For each allowed format the string is tried as-is and then with one
    character removed from the end, from the beginning, and from both ends.
    The first candidate that parses decides the outcome.

    :param string_date: text to check
    :return: the year as a four-digit string, or None when nothing parses or
        the first parsed year is before 1900 or after the current year
    """

    formats_allowed = ['%d-%m-%Y', '%d/%m/%Y', '%Y']
    current_year = datetime.now().year

    # The string itself, then trimmed variants that drop a stray character
    # at the end, the beginning, or both
    candidates = [
        string_date,
        string_date[:-1],
        string_date[1:],
        string_date[1:-1]
    ]

    for format_allowed in formats_allowed:
        for candidate in candidates:
            try:
                date = datetime.strptime(candidate, format_allowed)
            except ValueError:
                continue

            if date.year > current_year or date.year < 1900:
                return None  # Date out of range

            return date.strftime("%Y")

    return None
ef8a4d95755a add aws sam project
Dennis C. M. <dennis@denniscm.com>
parents:
diff changeset
143
ef8a4d95755a add aws sam project
Dennis C. M. <dennis@denniscm.com>
parents:
diff changeset
144
ef8a4d95755a add aws sam project
Dennis C. M. <dennis@denniscm.com>
parents:
diff changeset
def get_format(phrase):
    """
    Given a phrase verify if it is specified the amount format

    The match is a case-insensitive substring search, so plural forms
    ("millions") are covered by the singular without a separate check.

    :param phrase: text to inspect
    :return: 'thousand', 'million' or 'billion' (the first of these, in that
        order, found in the phrase), or None when none is present
    """

    amount_formats = ['thousand', 'million', 'billion']
    lowered_phrase = phrase.lower()

    for amount_format in amount_formats:
        if amount_format in lowered_phrase:
            return amount_format

    return None
ef8a4d95755a add aws sam project
Dennis C. M. <dennis@denniscm.com>
parents:
diff changeset
157
ef8a4d95755a add aws sam project
Dennis C. M. <dennis@denniscm.com>
parents:
diff changeset
158
4
9005b7590008 state machine working
Dennis C. M. <dennis@denniscm.com>
parents: 3
diff changeset
def clean_text(text, text_type='default'):
    """
    Remove bad characters from word

    :param text: text to clean
    :param text_type: 'date' keeps `_`, `-` and `/`; any other value keeps
        only `_`
    :return: lower-cased text without the remaining special characters
    """

    special_chars = '!@#$%^&*()-_+=[]{}\\|;:"\'<>/?.,'

    if text_type == 'date':
        allowed_chars = '_-/'
    else:
        allowed_chars = '_'

    chars_to_remove = ''.join(
        char for char in special_chars if char not in allowed_chars
    )

    # One pass that deletes every unwanted character
    return text.translate(str.maketrans('', '', chars_to_remove)).lower()