From 76456c1e82e83fcf748376c63c47d55f50b5f6e8 Mon Sep 17 00:00:00 2001 From: Qingyuan Wu Date: Thu, 16 Jun 2022 16:15:01 -0400 Subject: [PATCH 1/8] rename preprocess to preprocess-tifu --- {preprocess => preprocess-tifu}/tifu_analysis.py | 0 {preprocess => preprocess-tifu}/tifu_cumulative_stats.csv | 0 {preprocess => preprocess-tifu}/tifu_row_stats.csv | 0 {preprocess => preprocess-tifu}/tifu_tokenize.py | 0 4 files changed, 0 insertions(+), 0 deletions(-) rename {preprocess => preprocess-tifu}/tifu_analysis.py (100%) rename {preprocess => preprocess-tifu}/tifu_cumulative_stats.csv (100%) rename {preprocess => preprocess-tifu}/tifu_row_stats.csv (100%) rename {preprocess => preprocess-tifu}/tifu_tokenize.py (100%) diff --git a/preprocess/tifu_analysis.py b/preprocess-tifu/tifu_analysis.py similarity index 100% rename from preprocess/tifu_analysis.py rename to preprocess-tifu/tifu_analysis.py diff --git a/preprocess/tifu_cumulative_stats.csv b/preprocess-tifu/tifu_cumulative_stats.csv similarity index 100% rename from preprocess/tifu_cumulative_stats.csv rename to preprocess-tifu/tifu_cumulative_stats.csv diff --git a/preprocess/tifu_row_stats.csv b/preprocess-tifu/tifu_row_stats.csv similarity index 100% rename from preprocess/tifu_row_stats.csv rename to preprocess-tifu/tifu_row_stats.csv diff --git a/preprocess/tifu_tokenize.py b/preprocess-tifu/tifu_tokenize.py similarity index 100% rename from preprocess/tifu_tokenize.py rename to preprocess-tifu/tifu_tokenize.py From 7ae7dc0ed61e4789bb31157091fffd570408cdaf Mon Sep 17 00:00:00 2001 From: Qingyuan Wu Date: Fri, 17 Jun 2022 16:32:20 -0400 Subject: [PATCH 2/8] add hugging face usable data analysis --- .gitignore | 5 ++-- preprocess-hugging-face/analysis.py | 26 +++++++++++++++++++ .../hugging_face_cumulative_stats.csv | 2 ++ 3 files changed, 31 insertions(+), 2 deletions(-) create mode 100644 preprocess-hugging-face/analysis.py create mode 100644 preprocess-hugging-face/hugging_face_cumulative_stats.csv diff 
import json
import gzip
import re
import csv

# hugging face dataset: https://huggingface.co/datasets/sentence-transformers/reddit-title-body/tree/main
filename = 'reddit_title_text_2011.jsonl.gz'

# re.findall includes punctuation as separate tokens; compiled once instead of per line
_TOKEN_RE = re.compile(r"[\w']+|[.,!?:;]")


def is_usable(title, body):
    """Return True when the post fits our length window.

    Titles must have 3 to 15 tokens and bodies 15 to 60 tokens
    (punctuation marks count as tokens).
    """
    title_split = _TOKEN_RE.findall(title)
    body_split = _TOKEN_RE.findall(body)
    return 3 <= len(title_split) <= 15 and 15 <= len(body_split) <= 60


def main():
    """Stream the gzipped JSON-lines dataset and write cumulative usability stats.

    Streams line by line instead of accumulating parsed objects: the former
    `json_content` list was never read and held ~1.6M dicts in memory.
    """
    can_be_used = total_posts = 0
    with gzip.open(filename, 'rb') as gzip_file:
        for line in gzip_file:
            line = line.rstrip()
            if not line:
                continue
            obj = json.loads(line)
            if is_usable(obj["title"], obj["body"]):
                can_be_used += 1
            total_posts += 1

    with open("hugging_face_cumulative_stats.csv", "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["Can be used", can_be_used])
        writer.writerow(["Total posts", total_posts])


if __name__ == "__main__":
    main()
a/preprocess-hugging-face/analysis.py b/preprocess-hugging-face/analysis.py index 5c19e22..1c71077 100644 --- a/preprocess-hugging-face/analysis.py +++ b/preprocess-hugging-face/analysis.py @@ -14,8 +14,10 @@ if line: obj = json.loads(line) json_content.append(obj) + # re.findall includes punctuation title_split = re.findall(r"[\w']+|[.,!?:;]", obj["title"]) body_split = re.findall(r"[\w']+|[.,!?:;]", obj["body"]) + # titles between 3 to 15 words and body between 15 and 60 words (including punctuation) if 3 <= len(title_split) <= 15 and 15 <= len(body_split) <= 60: can_be_used += 1 total_posts += 1 diff --git a/preprocess-hugging-face/to_do.md b/preprocess-hugging-face/to_do.md new file mode 100644 index 0000000..2a2198d --- /dev/null +++ b/preprocess-hugging-face/to_do.md @@ -0,0 +1,8 @@ +### Preprocessing, Tokenizing, One-Hot +1. Pull the `preprocess-hugging-face` branch from remote + * https://github.com/CloudClub-uoft/Title-Generator/tree/preprocess-hugging-face +2. look at `tifu_tokenize.py` under `/preprocess` or `/preprocess-tifu` directory + * this is the code I used to tokenize and one-hot encode the tifu dataset, which we're probably not using (since we found the hugging face one which is better) +3. Copy most of the code to tokenize and one-hot encode the `reddit_title_text_2011.jsonl.gz` dataset + * first download [here](https://huggingface.co/datasets/sentence-transformers/reddit-title-body/tree/main) + * confirm that all words are lowercase first before you tokenize! 
import numpy as np
import re
import json
from itertools import islice


def build_list_of_sentences(file):
    """Return the non-null "tldr" fields from the first 10 lines of a JSON-lines file.

    Uses islice so only the sampled lines are read, instead of
    readlines() loading the entire file to slice off 10 entries.
    """
    paragraphs = []
    with open(file) as f:
        # example run with the first few entries:
        for raw in islice(f, 10):
            entry = json.loads(raw)
            if entry["tldr"] is not None:
                paragraphs.append(entry["tldr"])
    return paragraphs


def tokenizer(words: str):
    """Split the string *words* into word tokens and single punctuation tokens."""
    return re.findall(r"[\w']+|[.,!?:;]", words)


def generate_tokens(list_of_words):
    """Tokenize every string in *list_of_words*; returns a list of token lists."""
    return [tokenizer(words) for words in list_of_words]


def build_one_hot_from_tokens(tokens, max_length):
    """One-hot encode *tokens* into a (len(tokens), max_length, vocab+1) array.

    tokens: list of lists of word tokens.
    max_length: maximum passage length kept; for our project this is 60.
    Index 0 of the last axis is reserved (token ids start at 1), matching
    the original encoding. Passages longer than max_length are truncated;
    shorter ones leave trailing all-zero rows.
    """
    # Assign a unique 1-based index to each unique word, in first-seen order.
    token_index = {}
    for sample in tokens:
        for word in sample:
            if word not in token_index:
                token_index[word] = len(token_index) + 1

    # Guard the empty case: max() on an empty dict would raise ValueError.
    vocab_size = max(token_index.values()) + 1 if token_index else 1

    # vectorize our tokens
    results = np.zeros((len(tokens), max_length, vocab_size))
    for i, sample in enumerate(tokens):
        for j, word in enumerate(sample[:max_length]):
            results[i, j, token_index[word]] = 1.

    return results


if __name__ == "__main__":
    # Guarded so importing this module no longer requires the data file.
    list_of_sentences = build_list_of_sentences("tifu_all_tokenized_and_filtered.json")
    tokens = generate_tokens(list_of_sentences)
    one_hot = build_one_hot_from_tokens(tokens, 60)

    print(list_of_sentences)
    print(one_hot)
    print(one_hot.shape)  # (number_of_sentences x max_words_in_sentence x number_of_unique_words)
import numpy as np
import re
import json
import gzip
from itertools import islice


def build_list_of_sentences(file):
    """Return the "body" field of the first 10 lines of a gzipped JSON-lines file.

    Uses islice so only the sampled lines are decompressed, instead of
    readlines() loading the entire archive to slice off 10 entries.
    """
    paragraphs = []
    with gzip.open(file, "rb") as gzip_file:
        # example run with the first few entries:
        for raw in islice(gzip_file, 10):
            entry = json.loads(raw)
            paragraphs.append(entry["body"])
    return paragraphs


def tokenizer(words: str):
    """Split the string *words* into word tokens and single punctuation tokens."""
    return re.findall(r"[\w']+|[.,!?:;]", words)


def generate_tokens(list_of_words):
    """Tokenize every string in *list_of_words*; returns a list of token lists."""
    return [tokenizer(words) for words in list_of_words]


def build_one_hot_from_tokens(tokens, max_length):
    """One-hot encode *tokens* into a (len(tokens), max_length, vocab+1) array.

    tokens: list of lists of word tokens.
    max_length: maximum passage length kept; for our project this is 60.
    Index 0 of the last axis is reserved (token ids start at 1), matching
    the original encoding. Passages longer than max_length are truncated;
    shorter ones leave trailing all-zero rows.
    """
    # Assign a unique 1-based index to each unique word, in first-seen order.
    token_index = {}
    for sample in tokens:
        for word in sample:
            if word not in token_index:
                token_index[word] = len(token_index) + 1

    # Guard the empty case: max() on an empty dict would raise ValueError.
    vocab_size = max(token_index.values()) + 1 if token_index else 1

    # vectorize our tokens
    results = np.zeros((len(tokens), max_length, vocab_size))
    for i, sample in enumerate(tokens):
        for j, word in enumerate(sample[:max_length]):
            results[i, j, token_index[word]] = 1.

    return results


if __name__ == "__main__":
    # Guarded so importing this module (e.g. from sara_function.py) no longer
    # runs the tokenization at import time.
    # list_of_sentences = build_list_of_sentences(r"C:\Users\ssara\OneDrive\Documents\SCHOOL\programming\CloudAI\Title-Generator\preprocess-hugging-face\reddit_title_text_2011.jsonl.gz")
    list_of_sentences = ["I am testing this function. Yes I am."]
    tokens = generate_tokens(list_of_sentences)
    one_hot = build_one_hot_from_tokens(tokens, 60)

    # print(list_of_sentences)
    # print(one_hot)
    # print(one_hot.shape)
def split_one_hot_title(title, cutoff):
    """Split a one-hot encoded title at *cutoff*.

    title: 1 x max_length x m array (m = number of unique tokens; the
        leading axis is the batch axis of size 1 added by
        build_one_hot_from_tokens).
    cutoff: index of the first word to cut off; must be < title.shape[1].

    Returns (before_cutoff, next_word): the first *cutoff* one-hot rows
    (shape cutoff x m) and the one-hot row of the word at *cutoff*
    (shape m,).

    Raises ValueError for an out-of-range cutoff instead of the former
    behavior of printing and returning the strings ("didn't", "work").
    """
    if cutoff >= title.shape[1]:
        raise ValueError(f"cutoff {cutoff} out of range for title of length {title.shape[1]}")

    # title[0] drops the size-1 batch axis; the original title[:][0] did the
    # same thing (title[:] is a no-op view) in a confusing way.
    before_cutoff = title[0, :cutoff]
    next_word = title[0, cutoff]

    return before_cutoff, next_word


def main():
    """Demo: tokenize a sample sentence and split its one-hot encoding."""
    # Imported here (not via module-level `import *`) so importing
    # sara_function does not execute hugging_face_tokenize's script code.
    import hugging_face_tokenize as hft

    # list_of_sentences = hft.build_list_of_sentences(r"C:\Users\ssara\OneDrive\Documents\SCHOOL\programming\CloudAI\Title-Generator\preprocess-hugging-face\reddit_title_text_2011.jsonl.gz")
    list_of_sentences = ["I am testing this function. Yes I am."]
    tokens = hft.generate_tokens(list_of_sentences)
    one_hot_title = hft.build_one_hot_from_tokens(tokens, 60)

    before_cutoff, next_word = split_one_hot_title(one_hot_title, 0)
    print(before_cutoff)
    print(next_word)


if __name__ == "__main__":
    main()