From 76456c1e82e83fcf748376c63c47d55f50b5f6e8 Mon Sep 17 00:00:00 2001 From: Qingyuan Wu Date: Thu, 16 Jun 2022 16:15:01 -0400 Subject: [PATCH 1/8] rename preprocess to preprocess-tifu --- {preprocess => preprocess-tifu}/tifu_analysis.py | 0 {preprocess => preprocess-tifu}/tifu_cumulative_stats.csv | 0 {preprocess => preprocess-tifu}/tifu_row_stats.csv | 0 {preprocess => preprocess-tifu}/tifu_tokenize.py | 0 4 files changed, 0 insertions(+), 0 deletions(-) rename {preprocess => preprocess-tifu}/tifu_analysis.py (100%) rename {preprocess => preprocess-tifu}/tifu_cumulative_stats.csv (100%) rename {preprocess => preprocess-tifu}/tifu_row_stats.csv (100%) rename {preprocess => preprocess-tifu}/tifu_tokenize.py (100%) diff --git a/preprocess/tifu_analysis.py b/preprocess-tifu/tifu_analysis.py similarity index 100% rename from preprocess/tifu_analysis.py rename to preprocess-tifu/tifu_analysis.py diff --git a/preprocess/tifu_cumulative_stats.csv b/preprocess-tifu/tifu_cumulative_stats.csv similarity index 100% rename from preprocess/tifu_cumulative_stats.csv rename to preprocess-tifu/tifu_cumulative_stats.csv diff --git a/preprocess/tifu_row_stats.csv b/preprocess-tifu/tifu_row_stats.csv similarity index 100% rename from preprocess/tifu_row_stats.csv rename to preprocess-tifu/tifu_row_stats.csv diff --git a/preprocess/tifu_tokenize.py b/preprocess-tifu/tifu_tokenize.py similarity index 100% rename from preprocess/tifu_tokenize.py rename to preprocess-tifu/tifu_tokenize.py From 7ae7dc0ed61e4789bb31157091fffd570408cdaf Mon Sep 17 00:00:00 2001 From: Qingyuan Wu Date: Fri, 17 Jun 2022 16:32:20 -0400 Subject: [PATCH 2/8] add hugging face usable data analysis --- .gitignore | 5 ++-- preprocess-hugging-face/analysis.py | 26 +++++++++++++++++++ .../hugging_face_cumulative_stats.csv | 2 ++ 3 files changed, 31 insertions(+), 2 deletions(-) create mode 100644 preprocess-hugging-face/analysis.py create mode 100644 preprocess-hugging-face/hugging_face_cumulative_stats.csv diff 
import json
import gzip
import re
import csv

# hugging face dataset: https://huggingface.co/datasets/sentence-transformers/reddit-title-body/tree/main
filename = 'reddit_title_text_2011.jsonl.gz'

# re.findall includes punctuation as separate tokens; compiled once instead of per line
_TOKEN_RE = re.compile(r"[\w']+|[.,!?:;]")


def is_usable(title, body):
    """Return True when the post fits our length window.

    Titles must have 3 to 15 tokens and bodies 15 to 60 tokens
    (punctuation marks count as tokens).
    """
    title_split = _TOKEN_RE.findall(title)
    body_split = _TOKEN_RE.findall(body)
    return 3 <= len(title_split) <= 15 and 15 <= len(body_split) <= 60


def main():
    """Stream the gzipped JSON-lines dataset and write cumulative usability stats.

    Streams line by line instead of accumulating parsed objects: the former
    `json_content` list was never read and held ~1.6M dicts in memory.
    """
    can_be_used = total_posts = 0
    with gzip.open(filename, 'rb') as gzip_file:
        for line in gzip_file:
            line = line.rstrip()
            if not line:
                continue
            obj = json.loads(line)
            if is_usable(obj["title"], obj["body"]):
                can_be_used += 1
            total_posts += 1

    with open("hugging_face_cumulative_stats.csv", "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["Can be used", can_be_used])
        writer.writerow(["Total posts", total_posts])


if __name__ == "__main__":
    main()
a/preprocess-hugging-face/analysis.py b/preprocess-hugging-face/analysis.py index 5c19e22..1c71077 100644 --- a/preprocess-hugging-face/analysis.py +++ b/preprocess-hugging-face/analysis.py @@ -14,8 +14,10 @@ if line: obj = json.loads(line) json_content.append(obj) + # re.findall includes punctuation title_split = re.findall(r"[\w']+|[.,!?:;]", obj["title"]) body_split = re.findall(r"[\w']+|[.,!?:;]", obj["body"]) + # titles between 3 to 15 words and body between 15 and 60 words (including punctuation) if 3 <= len(title_split) <= 15 and 15 <= len(body_split) <= 60: can_be_used += 1 total_posts += 1 diff --git a/preprocess-hugging-face/to_do.md b/preprocess-hugging-face/to_do.md new file mode 100644 index 0000000..2a2198d --- /dev/null +++ b/preprocess-hugging-face/to_do.md @@ -0,0 +1,8 @@ +### Preprocessing, Tokenizing, One-Hot +1. Pull the `preprocess-hugging-face` branch from remote + * https://github.com/CloudClub-uoft/Title-Generator/tree/preprocess-hugging-face +2. look at `tifu_tokenize.py` under `/preprocess` or `/preprocess-tifu` directory + * this is the code I used to tokenize and one-hot encode the tifu dataset, which we're probably not using (since we found the hugging face one which is better) +3. Copy most of the code to tokenize and one-hot encode the `reddit_title_text_2011.jsonl.gz` dataset + * first download [here](https://huggingface.co/datasets/sentence-transformers/reddit-title-body/tree/main) + * confirm that all words are lowercase first before you tokenize! 
import numpy as np
import re
import json
from itertools import islice


def build_list_of_sentences(file):
    """Return the non-null "tldr" fields from the first 10 lines of a JSON-lines file.

    Uses islice so only the sampled lines are read, instead of
    readlines() loading the entire file to slice off 10 entries.
    """
    paragraphs = []
    with open(file) as f:
        # example run with the first few entries:
        for raw in islice(f, 10):
            entry = json.loads(raw)
            if entry["tldr"] is not None:
                paragraphs.append(entry["tldr"])
    return paragraphs


def tokenizer(words: str):
    """Split the string *words* into word tokens and single punctuation tokens."""
    return re.findall(r"[\w']+|[.,!?:;]", words)


def generate_tokens(list_of_words):
    """Tokenize every string in *list_of_words*; returns a list of token lists."""
    return [tokenizer(words) for words in list_of_words]


def build_one_hot_from_tokens(tokens, max_length):
    """One-hot encode *tokens* into a (len(tokens), max_length, vocab+1) array.

    tokens: list of lists of word tokens.
    max_length: maximum passage length kept; for our project this is 60.
    Index 0 of the last axis is reserved (token ids start at 1), matching
    the original encoding. Passages longer than max_length are truncated;
    shorter ones leave trailing all-zero rows.
    """
    # Assign a unique 1-based index to each unique word, in first-seen order.
    token_index = {}
    for sample in tokens:
        for word in sample:
            if word not in token_index:
                token_index[word] = len(token_index) + 1

    # Guard the empty case: max() on an empty dict would raise ValueError.
    vocab_size = max(token_index.values()) + 1 if token_index else 1

    # vectorize our tokens
    results = np.zeros((len(tokens), max_length, vocab_size))
    for i, sample in enumerate(tokens):
        for j, word in enumerate(sample[:max_length]):
            results[i, j, token_index[word]] = 1.

    return results


if __name__ == "__main__":
    # Guarded so importing this module no longer requires the data file.
    list_of_sentences = build_list_of_sentences("tifu_all_tokenized_and_filtered.json")
    tokens = generate_tokens(list_of_sentences)
    one_hot = build_one_hot_from_tokens(tokens, 60)

    print(list_of_sentences)
    print(one_hot)
    print(one_hot.shape)  # (number_of_sentences x max_words_in_sentence x number_of_unique_words)
import numpy as np
import re
import json
import gzip
from itertools import islice


def build_list_of_sentences(file):
    """Return the "body" field of the first 10 lines of a gzipped JSON-lines file.

    Uses islice so only the sampled lines are decompressed, instead of
    readlines() loading the entire archive to slice off 10 entries.
    """
    paragraphs = []
    with gzip.open(file, "rb") as gzip_file:
        # example run with the first few entries:
        for raw in islice(gzip_file, 10):
            entry = json.loads(raw)
            paragraphs.append(entry["body"])
    return paragraphs


def tokenizer(words: str):
    """Split the string *words* into word tokens and single punctuation tokens."""
    return re.findall(r"[\w']+|[.,!?:;]", words)


def generate_tokens(list_of_words):
    """Tokenize every string in *list_of_words*; returns a list of token lists."""
    return [tokenizer(words) for words in list_of_words]


def build_one_hot_from_tokens(tokens, max_length):
    """One-hot encode *tokens* into a (len(tokens), max_length, vocab+1) array.

    tokens: list of lists of word tokens.
    max_length: maximum passage length kept; for our project this is 60.
    Index 0 of the last axis is reserved (token ids start at 1), matching
    the original encoding. Passages longer than max_length are truncated;
    shorter ones leave trailing all-zero rows.
    """
    # Assign a unique 1-based index to each unique word, in first-seen order.
    token_index = {}
    for sample in tokens:
        for word in sample:
            if word not in token_index:
                token_index[word] = len(token_index) + 1

    # Guard the empty case: max() on an empty dict would raise ValueError.
    vocab_size = max(token_index.values()) + 1 if token_index else 1

    # vectorize our tokens
    results = np.zeros((len(tokens), max_length, vocab_size))
    for i, sample in enumerate(tokens):
        for j, word in enumerate(sample[:max_length]):
            results[i, j, token_index[word]] = 1.

    return results


if __name__ == "__main__":
    # Guarded so importing this module (e.g. from sara_function.py) no longer
    # runs the tokenization at import time.
    # list_of_sentences = build_list_of_sentences(r"C:\Users\ssara\OneDrive\Documents\SCHOOL\programming\CloudAI\Title-Generator\preprocess-hugging-face\reddit_title_text_2011.jsonl.gz")
    list_of_sentences = ["I am testing this function. Yes I am."]
    tokens = generate_tokens(list_of_sentences)
    one_hot = build_one_hot_from_tokens(tokens, 60)

    # print(list_of_sentences)
    # print(one_hot)
    # print(one_hot.shape)
def split_one_hot_title(title, cutoff):
    """Split a one-hot encoded title at *cutoff*.

    title: 1 x max_length x m array (m = number of unique tokens; the
        leading axis is the batch axis of size 1 added by
        build_one_hot_from_tokens).
    cutoff: index of the first word to cut off; must be < title.shape[1].

    Returns (before_cutoff, next_word): the first *cutoff* one-hot rows
    (shape cutoff x m) and the one-hot row of the word at *cutoff*
    (shape m,).

    Raises ValueError for an out-of-range cutoff instead of the former
    behavior of printing and returning the strings ("didn't", "work").
    """
    if cutoff >= title.shape[1]:
        raise ValueError(f"cutoff {cutoff} out of range for title of length {title.shape[1]}")

    # title[0] drops the size-1 batch axis; the original title[:][0] did the
    # same thing (title[:] is a no-op view) in a confusing way.
    before_cutoff = title[0, :cutoff]
    next_word = title[0, cutoff]

    return before_cutoff, next_word


def main():
    """Demo: tokenize a sample sentence and split its one-hot encoding."""
    # Imported here (not via module-level `import *`) so importing
    # sara_function does not execute hugging_face_tokenize's script code.
    import hugging_face_tokenize as hft

    # list_of_sentences = hft.build_list_of_sentences(r"C:\Users\ssara\OneDrive\Documents\SCHOOL\programming\CloudAI\Title-Generator\preprocess-hugging-face\reddit_title_text_2011.jsonl.gz")
    list_of_sentences = ["I am testing this function. Yes I am."]
    tokens = hft.generate_tokens(list_of_sentences)
    one_hot_title = hft.build_one_hot_from_tokens(tokens, 60)

    before_cutoff, next_word = split_one_hot_title(one_hot_title, 0)
    print(before_cutoff)
    print(next_word)


if __name__ == "__main__":
    main()