diff --git a/.gitignore b/.gitignore index f9426c1..e530f86 100644 --- a/.gitignore +++ b/.gitignore @@ -2,8 +2,9 @@ venv/ .vscode/ .DS_store/ -# json datasets -*/*.json +# datasets +*.json +*.jsonl.gz # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/preprocess-hugging-face/analysis.py b/preprocess-hugging-face/analysis.py new file mode 100644 index 0000000..1c71077 --- /dev/null +++ b/preprocess-hugging-face/analysis.py @@ -0,0 +1,28 @@ +import json +import gzip +import re +import csv + +# hugging fact dataset: https://huggingface.co/datasets/sentence-transformers/reddit-title-body/tree/main +filename = 'reddit_title_text_2011.jsonl.gz' + +json_content = [] +can_be_used = total_posts = 0 +with gzip.open(filename , 'rb') as gzip_file: + for line in gzip_file: + line = line.rstrip() + if line: + obj = json.loads(line) + json_content.append(obj) + # re.findall includes punctuation + title_split = re.findall(r"[\w']+|[.,!?:;]", obj["title"]) + body_split = re.findall(r"[\w']+|[.,!?:;]", obj["body"]) + # titles between 3 to 15 words and body between 15 and 60 words (including punctuation) + if 3 <= len(title_split) <= 15 and 15 <= len(body_split) <= 60: + can_be_used += 1 + total_posts += 1 + +with open("hugging_face_cumulative_stats.csv", "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["Can be used", can_be_used]) + writer.writerow(["Total posts", total_posts]) diff --git a/preprocess-hugging-face/hugging_face_cumulative_stats.csv b/preprocess-hugging-face/hugging_face_cumulative_stats.csv new file mode 100644 index 0000000..75ce903 --- /dev/null +++ b/preprocess-hugging-face/hugging_face_cumulative_stats.csv @@ -0,0 +1,2 @@ +Can be used,451542 +Total posts,1673264 diff --git a/preprocess-hugging-face/hugging_face_tokenize.py b/preprocess-hugging-face/hugging_face_tokenize.py new file mode 100644 index 0000000..cde1bf9 --- /dev/null +++ b/preprocess-hugging-face/hugging_face_tokenize.py @@ -0,0 +1,63 @@ +import numpy as np +import re +import json +import gzip + +def build_list_of_sentences(file): + """ + opens the file, returns the valid lines in a list of dictionaries + """ + with gzip.open(file, "rb") as gzip_file: + # example run with the first few entries: + lines = gzip_file.readlines()[:10] + paragraphs = [] + for line in lines: + line = json.loads(line) + paragraphs.append(line["body"]) + return paragraphs + +def tokenizer(words: str): + ''' + words is a string + ''' + return re.findall(r"[\w']+|[.,!?:;]", words) +def generate_tokens(list_of_words): + ''' + list_of_words is a list of strings + ''' + tokens = [tokenizer(words) for words in list_of_words] + return tokens + +def build_one_hot_from_tokens(tokens, max_length): + ''' + tokens should be a list of list of words + max_length is maximum length of all passages, for our project this will be 60 + ''' + # build an index of all tokens in the data + token_index = {} + i = 1 + for sample in tokens: + + for word in sample: + if word not in token_index: + # Assign a unique index to each unique word + token_index[word] = i + i += 1 + + # vectorize our tokens + results = np.zeros((len(tokens), max_length, max(token_index.values()) + 1)) + for i, sample in enumerate(tokens): + for j, word in enumerate(sample[:max_length]): + index = token_index.get(word) + results[i, j, index] = 1. + + return results + +# list_of_sentences = build_list_of_sentences(r"C:\Users\ssara\OneDrive\Documents\SCHOOL\programming\CloudAI\Title-Generator\preprocess-hugging-face\reddit_title_text_2011.jsonl.gz") +list_of_sentences = ["I am testing this function. Yes I am."] +tokens = generate_tokens(list_of_sentences) +one_hot = build_one_hot_from_tokens(tokens, 60) + +# print(list_of_sentences) +# print(one_hot) +# print(one_hot.shape) # (4x60x51) \ No newline at end of file diff --git a/preprocess-hugging-face/sara_function.py b/preprocess-hugging-face/sara_function.py new file mode 100644 index 0000000..be3d140 --- /dev/null +++ b/preprocess-hugging-face/sara_function.py @@ -0,0 +1,30 @@ +from hugging_face_tokenize import * + +def split_one_hot_title(title, cutoff): + ''' + title is a 1x60xm array. m is the number of unique tokens. + cutoff is an integer index of title. cutoff < len(title) + The first n words are kep. The (n+1)th word on are cut off + ''' + + # better error catching later + if cutoff >= title.shape[1]: + print('invalid string indexing') + return 'didn\'t', 'work' + + #? why the extra layers + #! i can't explain the second 0 yet + before_cutoff = title[:][0][0:cutoff] + next_word = title[:][0][cutoff] + + + return before_cutoff, next_word + +# list_of_sentences = build_list_of_sentences(r"C:\Users\ssara\OneDrive\Documents\SCHOOL\programming\CloudAI\Title-Generator\preprocess-hugging-face\reddit_title_text_2011.jsonl.gz") +list_of_sentences = ["I am testing this function. Yes I am."] +tokens = generate_tokens(list_of_sentences) +one_hot_title = build_one_hot_from_tokens(tokens, 60) + +[before_cutoff, next_word] = split_one_hot_title(one_hot_title, 0) +print(before_cutoff) +print(next_word) diff --git a/preprocess-hugging-face/to_do.md b/preprocess-hugging-face/to_do.md new file mode 100644 index 0000000..2a2198d --- /dev/null +++ b/preprocess-hugging-face/to_do.md @@ -0,0 +1,8 @@ +### Preprocessing, Tokenizing, One-Hot +1. Pull the `preprocess-hugging-face` branch from remote + * https://github.com/CloudClub-uoft/Title-Generator/tree/preprocess-hugging-face +2. look at `tifu_tokenize.py` under `/preprocess` or `/preprocess-tifu` directory + * this is the code I used to tokenize and one-hot encode the tifu dataset, which we're probably not using (since we found the hugging face one which is better) +3. Copy most of the code to tokenize and one-hot encode the `reddit_title_text_2011.jsonl.gz` dataset + * first download [here](https://huggingface.co/datasets/sentence-transformers/reddit-title-body/tree/main) + * confirm that all words are lowercase first before you tokenize! Might have to do `.lower()` diff --git a/preprocess/tifu_analysis.py b/preprocess-tifu/tifu_analysis.py similarity index 100% rename from preprocess/tifu_analysis.py rename to preprocess-tifu/tifu_analysis.py diff --git a/preprocess/tifu_cumulative_stats.csv b/preprocess-tifu/tifu_cumulative_stats.csv similarity index 100% rename from preprocess/tifu_cumulative_stats.csv rename to preprocess-tifu/tifu_cumulative_stats.csv diff --git a/preprocess/tifu_row_stats.csv b/preprocess-tifu/tifu_row_stats.csv similarity index 100% rename from preprocess/tifu_row_stats.csv rename to preprocess-tifu/tifu_row_stats.csv diff --git a/preprocess-tifu/tifu_tokenize.py b/preprocess-tifu/tifu_tokenize.py new file mode 100644 index 0000000..a7fbcd3 --- /dev/null +++ b/preprocess-tifu/tifu_tokenize.py @@ -0,0 +1,59 @@ +import numpy as np +import re +import json + +def build_list_of_sentences(file): + with open(file) as f: + # example run with the first few entries: + lines = f.readlines()[:10] + paragraphs = [] + for line in lines: + line = json.loads(line) + if line["tldr"] is not None: + paragraphs.append(line["tldr"]) + return paragraphs + +def tokenizer(words: str): + ''' + words is a string + ''' + return re.findall(r"[\w']+|[.,!?:;]", words) +def generate_tokens(list_of_words): + ''' + list_of_words is a list of strings + ''' + tokens = [tokenizer(words) for words in list_of_words] + return tokens + +def build_one_hot_from_tokens(tokens, max_length): + ''' + tokens should be a list of list of words + max_length is maximum length of all passages, for our project this will be 60 + ''' + # build an index of all tokens in the data + token_index = {} + i = 1 + for sample in tokens: + + for word in sample: + if word not in token_index: + # Assign a unique index to each unique word + token_index[word] = i + i += 1 + + # vectorize our tokens + results = np.zeros((len(tokens), max_length, max(token_index.values()) + 1)) + for i, sample in enumerate(tokens): + for j, word in enumerate(sample[:max_length]): + index = token_index.get(word) + results[i, j, index] = 1. + + return results + +list_of_sentences = build_list_of_sentences("tifu_all_tokenized_and_filtered.json") +tokens = generate_tokens(list_of_sentences) +one_hot = build_one_hot_from_tokens(tokens, 60) + +print(list_of_sentences) +print(one_hot) +print(one_hot.shape) # (4x60x51) \ No newline at end of file diff --git a/preprocess/tifu_tokenize.py b/preprocess/tifu_tokenize.py index caa8689..438ac1e 100644 --- a/preprocess/tifu_tokenize.py +++ b/preprocess/tifu_tokenize.py @@ -55,4 +55,4 @@ def build_one_hot_from_tokens(tokens, max_length): one_hot = build_one_hot_from_tokens(tokens, 60) print(one_hot) -print(one_hot.shape) # (4x60x51) \ No newline at end of file +print(one_hot.shape) # (4x60x51) (number_of_sentences x max_words_in_sentence x number_of_unique_words) \ No newline at end of file