-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathaiplay.py
More file actions
104 lines (90 loc) · 4.06 KB
/
aiplay.py
File metadata and controls
104 lines (90 loc) · 4.06 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import json
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
# Step 1: Parse the JSONL data
# Each line is one JSON record. From every record we read:
#   contents[0]             -> the user turn (its first part's "text" and an optional "mood" list)
#   contents[1]["parts"][1] -> the part holding the "artID" list
# The result is a list of {"text": ..., "artID": ...} training examples.
data = []
# JSON is UTF-8 by specification; being explicit avoids depending on the
# platform's default encoding (the GloVe file below is opened the same way).
with open('data_converted.jsonl', 'r', encoding='utf-8') as file:  # Replace with your actual file name
    for line in file:
        # Skip blank lines (e.g. a trailing newline at the end of the file),
        # which json.loads would reject.
        if not line.strip():
            continue
        entry = json.loads(line)
        user_turn = entry["contents"][0]
        reply_turn = entry["contents"][1]
        # Join the mood array into a string; a missing "mood" key yields "".
        mood_text = ", ".join(user_turn.get("mood", []))
        user_text = user_turn["parts"][0]["text"] + " mood: " + mood_text
        # Only the first ArtID of the reply is used as the training label.
        # The reply's own text is not needed for training, so it is not read.
        artID = reply_turn["parts"][1]["artID"][0]
        data.append({"text": user_text, "artID": artID})
# Step 2: Prepare data for training
texts = [entry["text"] for entry in data]
artIDs = [entry["artID"] for entry in data]

# Tokenize the texts
# num_words=1000 limits the vocabulary size for small datasets; words outside
# it are mapped to the "<OOV>" token.
tokenizer = Tokenizer(num_words=1000, oov_token="<OOV>")
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
# Every sequence is forced to length 10: shorter ones are zero-padded at the
# end; longer ones are cut (pad_sequences truncates from the front by default).
padded_sequences = pad_sequences(sequences, padding='post', maxlen=10)

# Map ArtIDs to integers
# dict.fromkeys de-duplicates while keeping first-seen order, so the
# ArtID -> class-index mapping is identical on every run. Iterating a set()
# gives no such guarantee (string hashing is randomised per process), which
# would make class indices differ between runs.
artID_to_index = {artID: idx for idx, artID in enumerate(dict.fromkeys(artIDs))}
index_to_artID = {idx: artID for artID, idx in artID_to_index.items()}
y = np.array([artID_to_index[artID] for artID in artIDs])

# Convert output to one-hot encoding
y = tf.keras.utils.to_categorical(y, num_classes=len(artID_to_index))

# Split data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences, y, test_size=0.2, random_state=42
)
# Step 3: Load GloVe embeddings
# Each line of the GloVe text file is "<word> <v1> <v2> ... <vN>"; the whole
# file is read into a word -> float32 vector dictionary.
embedding_dim = 200
embedding_index = {}
with open("glove.6B.200d.txt", "r", encoding="utf-8") as f:  # Replace with the path to your GloVe file
    for glove_line in f:
        fields = glove_line.split()
        embedding_index[fields[0]] = np.asarray(fields[1:], dtype='float32')

# Create embedding matrix
# Row i holds the GloVe vector of the tokenizer word with index i. Rows for
# words missing from GloVe (and row 0, which the tokenizer never assigns)
# stay all-zero.
word_index = tokenizer.word_index
num_words = min(1000, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, embedding_dim))
for token, row in word_index.items():
    # Indices at or beyond num_words fall outside the matrix.
    if row >= num_words:
        continue
    glove_vector = embedding_index.get(token)
    if glove_vector is None:
        continue
    embedding_matrix[row] = glove_vector
# Step 4: Build the model
# Architecture: frozen pre-trained embedding -> bidirectional LSTM ->
# small dense layer with dropout -> softmax over the known ArtIDs.
pretrained_embedding = tf.keras.layers.Embedding(
    input_dim=num_words,
    output_dim=embedding_dim,
    weights=[embedding_matrix],
    input_length=10,
    trainable=False,  # Keep the pre-trained vectors fixed during training
)

model = tf.keras.Sequential()
model.add(pretrained_embedding)
# return_sequences=False: only the final LSTM output is passed on.
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=False)))
model.add(tf.keras.layers.Dense(32, activation='relu'))
model.add(tf.keras.layers.Dropout(0.3))  # Dropout for regularization
# One output unit per distinct ArtID.
model.add(tf.keras.layers.Dense(len(artID_to_index), activation='softmax'))

# Compile the model
# Labels are one-hot encoded, hence categorical cross-entropy. The Adam
# learning rate of 0.001 is spelled out explicitly (it is Keras' default).
adam = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['accuracy'])

# Learning rate scheduler
# Halve the learning rate when val_loss has not improved for 3 epochs.
lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=3,
    verbose=1,
)
# Step 5: Train the model
# NOTE: the held-out test split is also used as validation data here, so the
# learning rate scheduler reacts to the loss on that same split.
fit_options = {
    "epochs": 15,
    "validation_data": (X_test, y_test),
    "batch_size": 16,  # Smaller batch size for small datasets
    "verbose": 2,  # One log line per epoch
    "callbacks": [lr_scheduler],
}
history = model.fit(X_train, y_train, **fit_options)
# Step 6: Test the model
# The input must be formatted like the training texts: "<text> mood: <moods>".
test_text = ["The weather is nice! mood: happy, excited"]  # Example test input
test_seq = tokenizer.texts_to_sequences(test_text)
# Pad to the same sequence length the training data was padded to. The
# previous hard-coded maxlen=20 did not match the length-10 training
# sequences and the Embedding layer's input_length=10.
test_padded = pad_sequences(test_seq, padding='post', maxlen=padded_sequences.shape[1])
# argmax over the softmax output gives the most probable class index.
predicted_class = np.argmax(model.predict(test_padded), axis=-1)

# Get the predicted ArtID
predicted_artID = index_to_artID[predicted_class[0]]
print(f"Recommended ArtID: {predicted_artID}")