Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@ venv/
.vscode/
.DS_store/

# json datasets
*/*.json
# datasets
*.json
*.jsonl.gz

# Byte-compiled / optimized / DLL files
__pycache__/
Expand Down
28 changes: 28 additions & 0 deletions preprocess-hugging-face/analysis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import json
import gzip
import re
import csv

# hugging fact dataset: https://huggingface.co/datasets/sentence-transformers/reddit-title-body/tree/main
filename = 'reddit_title_text_2011.jsonl.gz'

json_content = []
can_be_used = total_posts = 0
with gzip.open(filename , 'rb') as gzip_file:
for line in gzip_file:
line = line.rstrip()
if line:
obj = json.loads(line)
json_content.append(obj)
# re.findall includes punctuation
title_split = re.findall(r"[\w']+|[.,!?:;]", obj["title"])
body_split = re.findall(r"[\w']+|[.,!?:;]", obj["body"])
# titles between 3 to 15 words and body between 15 and 60 words (including punctuation)
if 3 <= len(title_split) <= 15 and 15 <= len(body_split) <= 60:
can_be_used += 1
total_posts += 1

with open("hugging_face_cumulative_stats.csv", "w", newline="") as f:
writer = csv.writer(f)
writer.writerow(["Can be used", can_be_used])
writer.writerow(["Total posts", total_posts])
2 changes: 2 additions & 0 deletions preprocess-hugging-face/hugging_face_cumulative_stats.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Can be used,451542
Total posts,1673264
63 changes: 63 additions & 0 deletions preprocess-hugging-face/hugging_face_tokenize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import numpy as np
import re
import json
import gzip

def build_list_of_sentences(file, max_lines=10):
    """Read a gzipped JSONL file and return the post bodies as strings.

    Args:
        file: path to a .jsonl.gz file where each line is a JSON object
            with a "body" key.
        max_lines: number of leading lines to read. Defaults to 10, which
            preserves the original example-run behavior; pass None to read
            the whole file.

    Returns:
        List of body strings, in file order.
    """
    paragraphs = []
    with gzip.open(file, "rb") as gzip_file:
        # Stream line by line instead of readlines()[:n], which would
        # load and decompress the entire file just to keep a prefix.
        for line_number, raw_line in enumerate(gzip_file):
            if max_lines is not None and line_number >= max_lines:
                break
            record = json.loads(raw_line)
            paragraphs.append(record["body"])
    return paragraphs

def tokenizer(words: str):
    """Split *words* into a flat list of word and punctuation tokens.

    A token is either a run of word characters/apostrophes or a single
    one of the punctuation marks . , ! ? : ; — everything else
    (whitespace, dashes, quotes, ...) is dropped.
    """
    token_pattern = r"[\w']+|[.,!?:;]"
    return re.findall(token_pattern, words)
def generate_tokens(list_of_words):
    """Tokenize each string in *list_of_words*.

    Returns a list of token lists, one per input string, produced by
    tokenizer().
    """
    return [tokenizer(sentence) for sentence in list_of_words]

def build_one_hot_from_tokens(tokens, max_length):
    """One-hot encode tokenized passages into a 3-D numpy array.

    Args:
        tokens: list of token lists, one per passage.
        max_length: maximum passage length kept; longer passages are
            truncated (60 for this project).

    Returns:
        Float array of shape (len(tokens), max_length, vocab + 1) where
        entry [i, j, k] is 1.0 when passage i's j-th token has index k.
        Token indices start at 1 in first-seen order; index 0 is never
        set (usable as padding).
    """
    # Assign a unique index (starting at 1) to each distinct token,
    # in order of first appearance across all passages.
    token_index = {}
    for sample in tokens:
        for word in sample:
            if word not in token_index:
                token_index[word] = len(token_index) + 1

    # len(token_index) + 1 equals max(index) + 1, and unlike the original
    # max(token_index.values()) it does not raise ValueError when tokens
    # is empty.
    results = np.zeros((len(tokens), max_length, len(token_index) + 1))
    for sample_idx, sample in enumerate(tokens):
        for position, word in enumerate(sample[:max_length]):
            results[sample_idx, position, token_index[word]] = 1.
    return results

# Smoke test: tokenize and one-hot encode one hard-coded sentence.
# Swap in the commented-out call below to run on the real downloaded dataset.
# list_of_sentences = build_list_of_sentences(r"C:\Users\ssara\OneDrive\Documents\SCHOOL\programming\CloudAI\Title-Generator\preprocess-hugging-face\reddit_title_text_2011.jsonl.gz")
list_of_sentences = ["I am testing this function. Yes I am."]
tokens = generate_tokens(list_of_sentences)
one_hot = build_one_hot_from_tokens(tokens, 60)

# Uncomment to inspect the encoding:
# print(list_of_sentences)
# print(one_hot)
# print(one_hot.shape)  # (number of sentences, max words per sentence, unique words + 1)
30 changes: 30 additions & 0 deletions preprocess-hugging-face/sara_function.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
from hugging_face_tokenize import *

def split_one_hot_title(title, cutoff):
    """Split a one-hot encoded title into a prefix and the next word.

    Args:
        title: array of shape (1, max_length, m), where m is the number
            of unique tokens (one-hot along the last axis).
        cutoff: integer position along the word axis, with
            0 <= cutoff < title.shape[1]. The first *cutoff* words are
            kept; the word at position *cutoff* is returned separately.

    Returns:
        (before_cutoff, next_word): arrays of shape (cutoff, m) and (m,).

    Raises:
        ValueError: if cutoff is out of range. (The original placeholder
        — flagged "better error catching later" — printed a message and
        returned two sentinel strings instead.)
    """
    if not 0 <= cutoff < title.shape[1]:
        raise ValueError(
            f"cutoff must be in [0, {title.shape[1]}), got {cutoff}"
        )

    # title[0] drops the leading singleton sample axis -> (max_length, m);
    # the original title[:][0] made a needless copy first.
    before_cutoff = title[0, :cutoff]
    next_word = title[0, cutoff]
    return before_cutoff, next_word

# Example run on one hard-coded sentence; the commented-out call below
# runs it on the real downloaded dataset instead.
# list_of_sentences = build_list_of_sentences(r"C:\Users\ssara\OneDrive\Documents\SCHOOL\programming\CloudAI\Title-Generator\preprocess-hugging-face\reddit_title_text_2011.jsonl.gz")
list_of_sentences = ["I am testing this function. Yes I am."]
tokens = generate_tokens(list_of_sentences)
one_hot_title = build_one_hot_from_tokens(tokens, 60)

# Split before/at word 0 and show both pieces.
before_cutoff, next_word = split_one_hot_title(one_hot_title, 0)
print(before_cutoff)
print(next_word)
8 changes: 8 additions & 0 deletions preprocess-hugging-face/to_do.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
### Preprocessing, Tokenizing, One-Hot
1. Pull the `preprocess-hugging-face` branch from remote
* https://github.com/CloudClub-uoft/Title-Generator/tree/preprocess-hugging-face
2. Look at `tifu_tokenize.py` under the `/preprocess` or `/preprocess-tifu` directory
* this is the code I used to tokenize and one-hot encode the tifu dataset, which we're probably not using (since we found the hugging face one which is better)
3. Copy most of the code to tokenize and one-hot encode the `reddit_title_text_2011.jsonl.gz` dataset
* first download [here](https://huggingface.co/datasets/sentence-transformers/reddit-title-body/tree/main)
* confirm that all words are lowercase first before you tokenize! Might have to do `.lower()`
File renamed without changes.
File renamed without changes.
59 changes: 59 additions & 0 deletions preprocess-tifu/tifu_tokenize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import numpy as np
import re
import json

def build_list_of_sentences(file, max_lines=10):
    """Read a JSON-lines file and return the non-null "tldr" fields.

    Args:
        file: path to a JSON-lines file; each line is a JSON object whose
            "tldr" key may be null.
        max_lines: number of leading lines to scan. Defaults to 10, which
            preserves the original example-run behavior; pass None to scan
            the whole file.

    Returns:
        List of tldr strings in file order; lines with a null tldr are
        skipped, so the result may be shorter than max_lines.
    """
    paragraphs = []
    with open(file) as f:
        # Stream line by line instead of readlines()[:n], which would
        # read the entire file just to keep a prefix.
        for line_number, raw_line in enumerate(f):
            if max_lines is not None and line_number >= max_lines:
                break
            record = json.loads(raw_line)
            if record["tldr"] is not None:
                paragraphs.append(record["tldr"])
    return paragraphs

def tokenizer(words: str):
    """Split a string into word tokens and single punctuation tokens.

    Words may contain apostrophes; each of the punctuation marks
    . , ! ? : ; becomes its own one-character token. Other characters
    (whitespace, dashes, quotes, ...) are not returned.
    """
    tokens = re.findall(r"[\w']+|[.,!?:;]", words)
    return tokens
def generate_tokens(list_of_words):
    """Tokenize every string in *list_of_words* with tokenizer().

    Returns one token list per input string, in order.
    """
    return list(map(tokenizer, list_of_words))

def build_one_hot_from_tokens(tokens, max_length):
    """One-hot encode tokenized passages into a 3-D numpy array.

    Args:
        tokens: list of token lists, one per passage.
        max_length: maximum passage length kept; longer passages are
            truncated (60 for this project).

    Returns:
        Float array of shape (len(tokens), max_length, vocab + 1) where
        entry [i, j, k] is 1.0 when passage i's j-th token has index k.
        Token indices start at 1 in first-seen order; index 0 is never
        set (usable as padding).
    """
    # Assign a unique index (starting at 1) to each distinct token,
    # in order of first appearance across all passages.
    token_index = {}
    for sample in tokens:
        for word in sample:
            if word not in token_index:
                token_index[word] = len(token_index) + 1

    # len(token_index) + 1 equals max(index) + 1, and unlike the original
    # max(token_index.values()) it does not raise ValueError when tokens
    # is empty.
    results = np.zeros((len(tokens), max_length, len(token_index) + 1))
    for sample_idx, sample in enumerate(tokens):
        for position, word in enumerate(sample[:max_length]):
            results[sample_idx, position, token_index[word]] = 1.
    return results

# Example run: read the first few TIFU entries, tokenize, one-hot encode.
# Guarded so importing this module for its functions (as sara_function.py
# does with the hugging-face version) no longer triggers the dataset read
# and the prints as a side effect.
if __name__ == "__main__":
    list_of_sentences = build_list_of_sentences("tifu_all_tokenized_and_filtered.json")
    tokens = generate_tokens(list_of_sentences)
    one_hot = build_one_hot_from_tokens(tokens, 60)

    print(list_of_sentences)
    print(one_hot)
    print(one_hot.shape)  # (number of sentences, max words per sentence, unique words + 1)
2 changes: 1 addition & 1 deletion preprocess/tifu_tokenize.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,4 +55,4 @@ def build_one_hot_from_tokens(tokens, max_length):
one_hot = build_one_hot_from_tokens(tokens, 60)

print(one_hot)
print(one_hot.shape) # (4x60x51)
print(one_hot.shape) # (4x60x51) (number_of_sentences x max_words_in_sentence x number_of_unique_words)