From b4765cfab19c7e936f2b77509cb6c758b3104b4c Mon Sep 17 00:00:00 2001
From: Dakota
Date: Wed, 12 Jul 2023 09:10:01 -0500
Subject: [PATCH 1/2] fixed by adding bos and eos manually

---
 llama.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llama.py b/llama.py
index 1a7a61f..a2e0f71 100644
--- a/llama.py
+++ b/llama.py
@@ -486,8 +486,8 @@ def packed_dataset(tokenizer, dataset: str):
     ds = load_dataset(dataset, split="train")
     all_tokens = []
     for i in tqdm(range(0, len(ds), 4096)):
-        tokens_batch = tokenizer.encode(ds[i:i+4096]["text"], add_eos=True)
-        tokens_batch = [np.array(tokens, dtype=np.uint16) for tokens in tokens_batch]
+        tokens_batch = tokenizer.encode(ds[i:i+4096]["text"], add_eos=True) # Doesn't actually add EOS :(
+        tokens_batch = [np.array([1] + tokens + [2], dtype=np.uint16) for tokens in tokens_batch]
         all_tokens.extend(tokens_batch)
 
     flattened = np.concatenate(all_tokens)

From d6c3cafefa27b2782ff5ea8e944effb201023078 Mon Sep 17 00:00:00 2001
From: Dakota
Date: Wed, 12 Jul 2023 09:10:43 -0500
Subject: [PATCH 2/2] fixed by adding bos and eos manually

removed add_eos in case it's fixed in a future update
---
 llama.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama.py b/llama.py
index a2e0f71..246819e 100644
--- a/llama.py
+++ b/llama.py
@@ -486,7 +486,7 @@ def packed_dataset(tokenizer, dataset: str):
     ds = load_dataset(dataset, split="train")
     all_tokens = []
     for i in tqdm(range(0, len(ds), 4096)):
-        tokens_batch = tokenizer.encode(ds[i:i+4096]["text"], add_eos=True) # Doesn't actually add EOS :(
+        tokens_batch = tokenizer.encode(ds[i:i+4096]["text"])
         tokens_batch = [np.array([1] + tokens + [2], dtype=np.uint16) for tokens in tokens_batch]
         all_tokens.extend(tokens_batch)
 
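For anyone reading the series without the surrounding file, here is a minimal standalone sketch of what the patched loop does: it frames each encoded document with LLaMA's SentencePiece BOS (id 1) and EOS (id 2) before packing everything into one flat token stream. The `sp` tokenizer, the `texts` list, and `frame_and_pack` below are hypothetical stand-ins for illustration, not names from llama.py; it assumes a tokenizer whose encode() returns a plain list of ints.

import numpy as np

# LLaMA SentencePiece defaults; hypothetical constants, not from llama.py.
BOS_ID, EOS_ID = 1, 2

def frame_and_pack(sp, texts):
    # Encode each document, wrap it with BOS/EOS manually (since the
    # tokenizer's add_eos flag is broken), and pack into one flat array.
    # uint16 is only safe while vocab_size < 65536; LLaMA's 32000 fits.
    framed = [
        np.array([BOS_ID] + sp.encode(t) + [EOS_ID], dtype=np.uint16)
        for t in texts
    ]
    return np.concatenate(framed)

Dropping add_eos=True in the second patch also guards against doubled EOS tokens: if the flag starts working in a future tokenizer release, the manual framing remains the only place EOS is added.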