Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@ venv/
.vscode/
.DS_store/

# json datasets
*/*.json
# datasets
*.json
*.jsonl.gz

# Byte-compiled / optimized / DLL files
__pycache__/
Expand Down
28 changes: 28 additions & 0 deletions preprocess-hugging-face/analysis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import json
import gzip
import re
import csv

# hugging fact dataset: https://huggingface.co/datasets/sentence-transformers/reddit-title-body/tree/main
filename = 'reddit_title_text_2011.jsonl.gz'

json_content = []
can_be_used = total_posts = 0
with gzip.open(filename , 'rb') as gzip_file:
for line in gzip_file:
line = line.rstrip()
if line:
obj = json.loads(line)
json_content.append(obj)
# re.findall includes punctuation
title_split = re.findall(r"[\w']+|[.,!?:;]", obj["title"])
body_split = re.findall(r"[\w']+|[.,!?:;]", obj["body"])
# titles between 3 to 15 words and body between 15 and 60 words (including punctuation)
if 3 <= len(title_split) <= 15 and 15 <= len(body_split) <= 60:
can_be_used += 1
total_posts += 1

with open("hugging_face_cumulative_stats.csv", "w", newline="") as f:
writer = csv.writer(f)
writer.writerow(["Can be used", can_be_used])
writer.writerow(["Total posts", total_posts])
2 changes: 2 additions & 0 deletions preprocess-hugging-face/hugging_face_cumulative_stats.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Can be used,451542
Total posts,1673264
63 changes: 63 additions & 0 deletions preprocess-hugging-face/hugging_face_tokenize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import numpy as np
import re
import json
import gzip

def build_list_of_sentences(file, max_lines=10):
    """Read a gzipped JSONL file and return the post bodies as strings.

    Args:
        file: path to a .jsonl.gz file where each line is a JSON object
            with a "body" key.
        max_lines: number of leading lines to read. Defaults to 10, which
            preserves the original example-run behavior; pass None to read
            the whole file.

    Returns:
        List of body strings, in file order.
    """
    paragraphs = []
    with gzip.open(file, "rb") as gzip_file:
        # Stream line by line instead of readlines()[:n], which would
        # load and decompress the entire file just to keep a prefix.
        for line_number, raw_line in enumerate(gzip_file):
            if max_lines is not None and line_number >= max_lines:
                break
            record = json.loads(raw_line)
            paragraphs.append(record["body"])
    return paragraphs

def tokenizer(words: str):
    """Split *words* into a flat list of word and punctuation tokens.

    A token is either a run of word characters/apostrophes or a single
    one of the punctuation marks . , ! ? : ; — everything else
    (whitespace, dashes, quotes, ...) is dropped.
    """
    token_pattern = r"[\w']+|[.,!?:;]"
    return re.findall(token_pattern, words)
def generate_tokens(list_of_words):
    """Tokenize each string in *list_of_words*.

    Returns a list of token lists, one per input string, produced by
    tokenizer().
    """
    return [tokenizer(sentence) for sentence in list_of_words]

def build_one_hot_from_tokens(tokens, max_length):
    """One-hot encode tokenized passages into a 3-D numpy array.

    Args:
        tokens: list of token lists, one per passage.
        max_length: maximum passage length kept; longer passages are
            truncated (60 for this project).

    Returns:
        Float array of shape (len(tokens), max_length, vocab + 1) where
        entry [i, j, k] is 1.0 when passage i's j-th token has index k.
        Token indices start at 1 in first-seen order; index 0 is never
        set (usable as padding).
    """
    # Assign a unique index (starting at 1) to each distinct token,
    # in order of first appearance across all passages.
    token_index = {}
    for sample in tokens:
        for word in sample:
            if word not in token_index:
                token_index[word] = len(token_index) + 1

    # len(token_index) + 1 equals max(index) + 1, and unlike the original
    # max(token_index.values()) it does not raise ValueError when tokens
    # is empty.
    results = np.zeros((len(tokens), max_length, len(token_index) + 1))
    for sample_idx, sample in enumerate(tokens):
        for position, word in enumerate(sample[:max_length]):
            results[sample_idx, position, token_index[word]] = 1.
    return results

# Smoke test: tokenize and one-hot encode one hard-coded sentence.
# Swap in the commented-out call below to run on the real downloaded dataset.
# list_of_sentences = build_list_of_sentences(r"C:\Users\ssara\OneDrive\Documents\SCHOOL\programming\CloudAI\Title-Generator\preprocess-hugging-face\reddit_title_text_2011.jsonl.gz")
list_of_sentences = ["I am testing this function. Yes I am."]
tokens = generate_tokens(list_of_sentences)
one_hot = build_one_hot_from_tokens(tokens, 60)

# Uncomment to inspect the encoding:
# print(list_of_sentences)
# print(one_hot)
# print(one_hot.shape)  # (number of sentences, max words per sentence, unique words + 1)
30 changes: 30 additions & 0 deletions preprocess-hugging-face/sara_function.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
from hugging_face_tokenize import *

def split_one_hot_title(title, cutoff):
    """Split a one-hot encoded title into a prefix and the next word.

    Args:
        title: array of shape (1, max_length, m), where m is the number
            of unique tokens (one-hot along the last axis).
        cutoff: integer position along the word axis, with
            0 <= cutoff < title.shape[1]. The first *cutoff* words are
            kept; the word at position *cutoff* is returned separately.

    Returns:
        (before_cutoff, next_word): arrays of shape (cutoff, m) and (m,).

    Raises:
        ValueError: if cutoff is out of range. (The original placeholder
        — flagged "better error catching later" — printed a message and
        returned two sentinel strings instead.)
    """
    if not 0 <= cutoff < title.shape[1]:
        raise ValueError(
            f"cutoff must be in [0, {title.shape[1]}), got {cutoff}"
        )

    # title[0] drops the leading singleton sample axis -> (max_length, m);
    # the original title[:][0] made a needless copy first.
    before_cutoff = title[0, :cutoff]
    next_word = title[0, cutoff]
    return before_cutoff, next_word

# Example run on one hard-coded sentence; the commented-out call below
# runs it on the real downloaded dataset instead.
# list_of_sentences = build_list_of_sentences(r"C:\Users\ssara\OneDrive\Documents\SCHOOL\programming\CloudAI\Title-Generator\preprocess-hugging-face\reddit_title_text_2011.jsonl.gz")
list_of_sentences = ["I am testing this function. Yes I am."]
tokens = generate_tokens(list_of_sentences)
one_hot_title = build_one_hot_from_tokens(tokens, 60)

# Split before/at word 0 and show both pieces.
before_cutoff, next_word = split_one_hot_title(one_hot_title, 0)
print(before_cutoff)
print(next_word)
8 changes: 8 additions & 0 deletions preprocess-hugging-face/to_do.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
### Preprocessing, Tokenizing, One-Hot
1. Pull the `preprocess-hugging-face` branch from remote
* https://github.com/CloudClub-uoft/Title-Generator/tree/preprocess-hugging-face
2. Look at `tifu_tokenize.py` under the `/preprocess` or `/preprocess-tifu` directory
* this is the code I used to tokenize and one-hot encode the tifu dataset, which we're probably not using (since we found the hugging face one which is better)
3. Copy most of the code to tokenize and one-hot encode the `reddit_title_text_2011.jsonl.gz` dataset
* first download [here](https://huggingface.co/datasets/sentence-transformers/reddit-title-body/tree/main)
* confirm that all words are lowercase first before you tokenize! Might have to do `.lower()`
File renamed without changes.
File renamed without changes.
59 changes: 59 additions & 0 deletions preprocess-tifu/tifu_tokenize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import numpy as np
import re
import json

def build_list_of_sentences(file, max_lines=10):
    """Read a JSON-lines file and return the non-null "tldr" fields.

    Args:
        file: path to a JSON-lines file; each line is a JSON object whose
            "tldr" key may be null.
        max_lines: number of leading lines to scan. Defaults to 10, which
            preserves the original example-run behavior; pass None to scan
            the whole file.

    Returns:
        List of tldr strings in file order; lines with a null tldr are
        skipped, so the result may be shorter than max_lines.
    """
    paragraphs = []
    with open(file) as f:
        # Stream line by line instead of readlines()[:n], which would
        # read the entire file just to keep a prefix.
        for line_number, raw_line in enumerate(f):
            if max_lines is not None and line_number >= max_lines:
                break
            record = json.loads(raw_line)
            if record["tldr"] is not None:
                paragraphs.append(record["tldr"])
    return paragraphs

def tokenizer(words: str):
    """Split a string into word tokens and single punctuation tokens.

    Words may contain apostrophes; each of the punctuation marks
    . , ! ? : ; becomes its own one-character token. Other characters
    (whitespace, dashes, quotes, ...) are not returned.
    """
    tokens = re.findall(r"[\w']+|[.,!?:;]", words)
    return tokens
def generate_tokens(list_of_words):
    """Tokenize every string in *list_of_words* with tokenizer().

    Returns one token list per input string, in order.
    """
    return list(map(tokenizer, list_of_words))

def build_one_hot_from_tokens(tokens, max_length):
    """One-hot encode tokenized passages into a 3-D numpy array.

    Args:
        tokens: list of token lists, one per passage.
        max_length: maximum passage length kept; longer passages are
            truncated (60 for this project).

    Returns:
        Float array of shape (len(tokens), max_length, vocab + 1) where
        entry [i, j, k] is 1.0 when passage i's j-th token has index k.
        Token indices start at 1 in first-seen order; index 0 is never
        set (usable as padding).
    """
    # Assign a unique index (starting at 1) to each distinct token,
    # in order of first appearance across all passages.
    token_index = {}
    for sample in tokens:
        for word in sample:
            if word not in token_index:
                token_index[word] = len(token_index) + 1

    # len(token_index) + 1 equals max(index) + 1, and unlike the original
    # max(token_index.values()) it does not raise ValueError when tokens
    # is empty.
    results = np.zeros((len(tokens), max_length, len(token_index) + 1))
    for sample_idx, sample in enumerate(tokens):
        for position, word in enumerate(sample[:max_length]):
            results[sample_idx, position, token_index[word]] = 1.
    return results

# Example run: read the first few TIFU entries, tokenize, one-hot encode.
# Guarded so importing this module for its functions (as sara_function.py
# does with the hugging-face version) no longer triggers the dataset read
# and the prints as a side effect.
if __name__ == "__main__":
    list_of_sentences = build_list_of_sentences("tifu_all_tokenized_and_filtered.json")
    tokens = generate_tokens(list_of_sentences)
    one_hot = build_one_hot_from_tokens(tokens, 60)

    print(list_of_sentences)
    print(one_hot)
    print(one_hot.shape)  # (number of sentences, max words per sentence, unique words + 1)
2 changes: 1 addition & 1 deletion preprocess/tifu_tokenize.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,4 +55,4 @@ def build_one_hot_from_tokens(tokens, max_length):
one_hot = build_one_hot_from_tokens(tokens, 60)

print(one_hot)
print(one_hot.shape) # (4x60x51)
print(one_hot.shape) # (4x60x51) (number_of_sentences x max_words_in_sentence x number_of_unique_words)