tokenizer.py
ascii_char_len = 126 - 32 + 1  # 95 printable ASCII characters, codes 32-126
special_token_ids = {
    "<begin>": ascii_char_len,
    "<eos>": ascii_char_len + 1,
}
special_id_tokens = {value: key for key, value in special_token_ids.items()}
vocab_size = ascii_char_len + len(special_token_ids) # ascii tokens + special_tokens


# The encoder maps each printable ASCII character to an id in [0, ascii_char_len);
# it does not encode special tokens.
def encode(string):
    return [ord(char) - 32 for char in string]


def decode(idx):
    result = ''
    for index in idx:
        if index >= ascii_char_len:
            # Ids at or above ascii_char_len map back to special tokens.
            result += special_id_tokens[index]
        else:
            result += chr(index + 32)
    return result


if __name__ == "__main__":
    s = "123+abc=ABC[,./+`~]"
    idx = encode(s)
    print(idx)
    # Wrap the encoded sequence with <begin> and <eos> before decoding.
    idx = [ascii_char_len] + idx + [ascii_char_len + 1]
    decoded = decode(idx)
    print(decoded)