-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathimport_data.py
More file actions
60 lines (52 loc) · 1.6 KB
/
import_data.py
File metadata and controls
60 lines (52 loc) · 1.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import numpy as np
import os
import sys
import re
################################################################################
#Import data function
def import_data(data_name, filename, encoding=None):
    """Read a text file into a 1-D NumPy array with one element per line.

    Leading and trailing whitespace (including the newline) is stripped from
    every line. A short summary containing ``data_name`` and the resulting
    array shape is printed to stdout.

    Args:
        data_name: Label used only in the printed summary.
        filename: Path of the text file to read.
        encoding: Text encoding of the file. When ``None`` (the default) the
            Windows-only ``'ansi'`` codec is used if the running interpreter
            provides it; otherwise ``'cp1252'`` is used.

    Returns:
        ``numpy.ndarray`` holding the stripped lines, in file order.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file content is not valid in ``encoding``.
    """
    if encoding is None:
        import codecs

        # 'ansi' is an alias of 'mbcs' and exists on Windows only; everywhere
        # else open() would fail with "LookupError: unknown encoding: ansi".
        try:
            codecs.lookup('ansi')
            encoding = 'ansi'
        except LookupError:
            # NOTE(review): assumes the data files use the Western European
            # Windows code page -- pass `encoding` explicitly if they do not.
            encoding = 'cp1252'

    with open(filename, encoding=encoding) as inputfile:
        data = np.array([line.strip() for line in inputfile])
    print("import ",data_name," of size", data.shape)
    return data
# Load the training inputs (X) and targets (Y). The arrays are bound directly
# from import_data(); no placeholder initialisation is needed beforehand.
titles = import_data('titles', "real_data/train/titles_uniq.txt")
scores = import_data('scores',"real_data/train/score_uniq.txt")

# Load the test inputs (X_test) and targets (Y_test).
test_titles = import_data('test_titles',"real_data/test/titles_uniq.txt")
test_scores = import_data('test_scores', "real_data/test/score_uniq.txt")

print("END OF IMPORT")
print("#############################")
############################################################
# Punctuation treated as a word separator when tokenising titles. The
# backslash is escaped explicitly; the resulting characters are unchanged.
_TITLE_FILTERS = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~'


def _split_words(text, lower=True, filters=_TITLE_FILTERS, split=' '):
    """Standard-library stand-in for Keras' ``text_to_word_sequence``."""
    if lower:
        text = text.lower()
    # Turn every filtered character into the separator, then drop the empty
    # tokens produced by consecutive separators.
    table = str.maketrans({char: split for char in filters})
    return [token for token in text.translate(table).split(split) if token]


def clean(titles, max_words=100):
    """Tokenise titles into a fixed-width matrix of lower-cased words.

    Each title is lower-cased, the punctuation in ``_TITLE_FILTERS`` is
    treated as a separator, and the title is split on spaces. Row ``i`` of
    the result holds the words of ``titles[i]`` in order, right-padded with
    empty strings up to ``max_words`` columns.

    Args:
        titles: Iterable of title strings.
        max_words: Number of columns in the result. Titles with more words
            are truncated to this length (the previous hard-coded width of
            100 raised ``IndexError`` on such titles).

    Returns:
        ``numpy.matrix`` of strings with one row per title and ``max_words``
        columns.
    """
    # Imported lazily so that merely loading this module does not need Keras.
    # Not every Keras release ships ``keras.preprocessing.text``; fall back
    # to the local tokenizer when it cannot be imported.
    try:
        from keras.preprocessing.text import text_to_word_sequence
    except ImportError:
        text_to_word_sequence = _split_words

    rows = []
    for title in titles:
        words = text_to_word_sequence(
            title, lower=True, filters=_TITLE_FILTERS, split=' ')
        words = list(words)[:max_words]
        # Pad so that every row has exactly `max_words` entries.
        rows.append(words + [""] * (max_words - len(words)))
    return np.matrix(rows)
# Tokenisation is currently disabled: the two lines below are commented out,
# so `titles` stays the array of raw lines returned by import_data().
#titles = clean(titles)
#print(titles)
# Print the shape of the training titles array.
print(titles.shape)