-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtfidf.py
More file actions
37 lines (28 loc) · 952 Bytes
/
tfidf.py
File metadata and controls
37 lines (28 loc) · 952 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import math
# A document is represented as an array (list) of words.
def frequency(word, document):
    """Return how many times ``word`` occurs in ``document``.

    The counting is delegated to ``document.count``, so ``document`` must
    expose that method; elsewhere in this file a document is described as
    an array of words.
    """
    occurrences = document.count(word)
    return occurrences
# def hitung_banyak_kata(document):
# return len(document)
def tf(kata, document):
    """Return the log-scaled term frequency of ``kata`` in ``document``.

    The raw count is obtained from ``frequency``. When the word occurs at
    least once the result is ``1 + ln(count)``; otherwise it is ``0.0``.
    """
    count = frequency(kata, document)
    # The logarithm is only taken for a positive count; a missing word
    # scores exactly zero.
    return 1.0 + math.log(count) if count > 0 else 0.0
# def hitung_banyak_dokumen(kata, kumpulan_dokumen):
# jumlah = 0
# for document in kumpulan_dokumen :
# if frequency(kata, document) > 0:
# jumlah += 1
# return 1 + jumlah
def hitung_banyak_dokumen(kata, kumpulan_dokumen):
    """Return one plus the number of documents that contain ``kata``.

    Each element of ``kumpulan_dokumen`` is tested with ``kata in document``.
    Because of the added one, the result is at least 1 even when no
    document matches or the collection is empty.
    """
    matches = sum(1 for dokumen in kumpulan_dokumen if kata in dokumen)
    return 1 + matches
def idf(kata, kumpulan_dokumen):
    """Return the inverse document frequency of ``kata``.

    Computed as ``ln(1 + N / df)``, where ``N`` is the number of documents
    in ``kumpulan_dokumen`` and ``df`` is the value returned by
    ``hitung_banyak_dokumen`` for ``kata``.
    """
    total_documents = float(len(kumpulan_dokumen))
    matching_documents = float(hitung_banyak_dokumen(kata, kumpulan_dokumen))
    ratio = total_documents / matching_documents
    return math.log(1 + ratio)
def tfidf(kata, document, kumpulan_dokumen):
    """Return the TF-IDF weight of ``kata`` for ``document``.

    The weight is the product of ``tf(kata, document)`` and
    ``idf(kata, kumpulan_dokumen)``.
    """
    term_weight = tf(kata, document)
    corpus_weight = idf(kata, kumpulan_dokumen)
    return term_weight * corpus_weight