forked from Ganesh2409/Course-Recommendation-System
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathCourseRecommendationSystem.py
More file actions
133 lines (74 loc) · 3.39 KB
/
CourseRecommendationSystem.py
File metadata and controls
133 lines (74 loc) · 3.39 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
print('Dependencies Imported')

# Load the raw Coursera catalogue. The path is relative to the current working
# directory, so the script has to be started from the project root.
data = pd.read_csv("Data/Coursera.csv")
data.head(5)

"""# Basic Data Analysis"""
# NOTE: the bare expressions below only display a result when executed cell by
# cell in a notebook/REPL; when run as a plain script their values are discarded
# (data.info() is the exception: it prints its summary itself).
data.shape #3522 courses and 7 columns with different attributes
data.info()
data.isnull().sum() #no value is missing
data['Difficulty Level'].value_counts()
data['Course Rating'].value_counts()
data['University'].value_counts()
data['Course Name']

# Keep only the columns that are later combined into the 'tags' text.
# .copy() makes this an independent frame: the column assignments that follow
# would otherwise write into a slice of the original frame, which triggers
# pandas' chained-assignment warning and leaves it unclear which object changed.
data = data[['Course Name','Difficulty Level','Course Description','Skills']].copy()
data.head(5)
# Replace the spaces between words with commas and strip punctuation
# (lambda functions could be used as well).
#
# regex=False is passed on every call: all patterns here are literal strings,
# and '(' / ')' are regex metacharacters, so relying on the default value of
# `regex` (which differs between pandas versions) is fragile.
data['Course Name'] = (
    data['Course Name']
    .str.replace(' ', ',', regex=False)
    .str.replace(',,', ',', regex=False)   # collapse the doubled commas left by double spaces
    .str.replace(':', '', regex=False)
)
data['Course Description'] = (
    data['Course Description']
    .str.replace(' ', ',', regex=False)
    .str.replace(',,', ',', regex=False)
    .str.replace('_', '', regex=False)
    .str.replace(':', '', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)

# Removing parentheses from the Skills column
data['Skills'] = (
    data['Skills']
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)
data.head(5)
# Combine the descriptive columns into one text blob per course.
# The parts are joined with ',' so that the last word of one column and the
# first word of the next remain separate tokens; plain concatenation fuses them
# (e.g. '...Miro' + 'Beginner' -> 'MiroBeginner'). The commas are turned into
# spaces below, so the separator never reaches the final tags.
data['tags'] = (
    data['Course Name'] + ','
    + data['Difficulty Level'] + ','
    + data['Course Description'] + ','
    + data['Skills']
)
data.head(5)
data['tags'].iloc[1]

# Explicit copy: new_df is modified below, and assigning into a slice of `data`
# would raise pandas' chained-assignment warning.
new_df = data[['Course Name','tags']].copy()
new_df.head(5)

# Commas were only a temporary word separator; switch back to spaces.
new_df['tags'] = data['tags'].str.replace(',', ' ', regex=False)
new_df['Course Name'] = data['Course Name'].str.replace(',', ' ', regex=False)
new_df.rename(columns = {'Course Name':'course_name'}, inplace = True)
new_df['tags'] = new_df['tags'].str.lower() #lower casing the tags column
new_df.head(5)
new_df.shape #3522 courses with tags and 2 columns (course_name and tags)
from sklearn.feature_extraction.text import CountVectorizer
#!pip install nltk
import nltk #for stemming process
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()


#defining the stemming function
def stem(text):
    """Return *text* with every whitespace-separated token Porter-stemmed.

    The text is split on whitespace, each token is passed through the
    module-level ``PorterStemmer`` instance ``ps``, and the stems are joined
    back together with single spaces.
    """
    return " ".join(ps.stem(token) for token in text.split())


# Stem BEFORE vectorizing. The count vectors (and therefore the similarity
# matrix) are built from the 'tags' column as it is at the time of
# fit_transform; stemming the column afterwards would leave them unchanged.
new_df['tags'] = new_df['tags'].apply(stem) #applying stemming on the tags column

# Bag-of-words counts over at most 5000 terms, with English stop words removed.
cv = CountVectorizer(max_features=5000,stop_words='english')
vectors = cv.fit_transform(new_df['tags']).toarray()
print(vectors[0])

"""# Similarity Measure"""
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity between the rows of `vectors`.
similarity = cosine_similarity(vectors)
"""# Recommendation Function"""
def recommend(course, top_n=6, courses=None, scores=None):
    """Print and return the courses most similar to *course*.

    Args:
        course: Course title, matched exactly against the ``course_name``
            column.
        top_n: Maximum number of recommendations. Defaults to 6, the number
            this function has always printed.
        courses: Object whose ``['course_name']`` yields the course titles in
            row order. Defaults to the module-level ``new_df``.
        scores: Square similarity matrix whose row/column order matches
            ``courses``. Defaults to the module-level ``similarity``.

    Returns:
        The recommended course names as a list, best match first. The same
        names are also printed, one per line.

    Raises:
        IndexError: If *course* is not present in ``course_name``. The type
            matches what the earlier ``.index[0]`` lookup raised, but the
            message now names the missing title.
    """
    if courses is None:
        courses = new_df
    if scores is None:
        scores = similarity

    # Work with row positions rather than index labels: the similarity matrix
    # is addressed by position, which only equals the label for a default index.
    names = list(courses['course_name'])
    try:
        course_index = names.index(course)
    except ValueError:
        raise IndexError(f"course not found: {course!r}") from None

    # Drop the query course explicitly instead of assuming it sorts first; a
    # different course with an equal top score could otherwise push the query
    # course itself into the recommendations.
    candidates = [
        (position, score)
        for position, score in enumerate(scores[course_index])
        if position != course_index
    ]
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    recommended = [names[position] for position, _ in candidates[:max(top_n, 0)]]
    for name in recommended:
        print(name)
    return recommended
# Smoke test: print recommendations for one known course title.
recommend('Business Strategy Business Model Canvas Analysis with Miro')

# Exporting the Model
import pickle

# Create the output folder up front so the export does not fail with
# FileNotFoundError on a fresh checkout.
os.makedirs('models', exist_ok=True)

# `with` closes (and therefore flushes) each file even if pickling fails;
# the bare open(...) calls used before never closed their handles.
with open('models/similarity.pkl', 'wb') as fh:
    pickle.dump(similarity, fh)
with open('models/course_list.pkl', 'wb') as fh:
    pickle.dump(new_df.to_dict(), fh) #contains the dataframe in dict
with open('models/courses.pkl', 'wb') as fh:
    pickle.dump(new_df, fh)