From 63eef4f09bbfb10f2d8459d800cb67811546800b Mon Sep 17 00:00:00 2001 From: srk Date: Fri, 2 Aug 2019 17:46:28 +0530 Subject: [PATCH 1/3] Implementing fast btm --- biterm/btm.py | 57 ++++++++++++++++++++------ biterm/vose_sampler.py | 92 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 137 insertions(+), 12 deletions(-) create mode 100644 biterm/vose_sampler.py diff --git a/biterm/btm.py b/biterm/btm.py index af548e9..5d0ff9c 100644 --- a/biterm/btm.py +++ b/biterm/btm.py @@ -1,6 +1,7 @@ import numpy as np from itertools import combinations, chain from tqdm import trange +from .vose_sampler import VoseAlias class oBTM: @@ -17,12 +18,27 @@ def __init__(self, num_topics, V, alpha=1., beta=0.01, l=0.5): self.beta = np.full((len(self.V), self.K), beta) self.l = l + def compute_corpus_acceptance(k_topic,proposal_topic): + doc_proposal_1 = ((n_z[k_topic] + self.alpha[k_topic])*(n_wz[b_i[0],k_topic] + self.beta[b_i[0],k_topic])*(n_wz[b_i[1],k_topic] + self.beta[b_i[0],k_topic]))/ + (n_z[proposal_topic] + self.alpha[proposal_topic])*(n_wz[b_i[0],proposal_topic] + self.beta[b_i[0],proposal_topic])*(n_wz[b_i[1],proposal_topic] + self.beta[b_i[0],proposal_topic]) + doc_proposal_2 = ((2 * n_z[proposal_topic] + self.beta[proposal_topic].sum(axis=0))**2)*(n_z[proposal_topic]+1+self.alpha[proposal_topic])/ + ((2 * n_z[k_topic] + self.beta[k_topic].sum(axis=0))**2)*(n_z[k_topic]+1+self.alpha[k_topic]) + doc_proposal = doc_proposal_1*doc_proposal_2 + return min(1,doc_proposal) + + def compute_term_acceptance(wi,wj,s_topic,t_topic): + term_proposal_1 = (n_wz[wi,t_topic] + self.beta[wi,t_topic])*(n_wz[wj,t_topic] + self.beta[wj,t_topic])*((2 * n_z[s_topic] + self.beta[wi,s_topic])**2)/ + (n_wz[wi,s_topic] + self.beta[wi,s_topic])*(n_wz[wj,s_topic] + self.beta[wj,s_topic])*((2 * n_z[t_topic] + self.beta[wi,t_topic])**2) + term_proposal_2 = (n_z[t_topic] + self.alpha[t_topic])*(n_wz[wi,s_topic] + self.beta[wi,s_topic])*(2 * n_z[t_topic] + 1 + self.beta[wi,t_topic])/ + (n_z[s_topic] + self.alpha[s_topic])*(n_wz[wi,t_topic] + self.beta[wi,t_topic])*(2 * n_z[s_topic] + 1 + self.beta[wi,s_topic]) + term_proposal = term_proposal*term_proposal_1 + return min(1,term_proposal) def _gibbs(self, iterations): - Z = np.zeros(len(self.B), dtype=np.int16) n_wz = np.zeros((len(self.V), self.K), dtype=int) n_z = np.zeros(self.K, dtype=int) + n_aw = np.zeros(self.V) for i, b_i in enumerate(self.B): topic = np.random.choice(self.K, 1)[0] @@ -30,23 +46,40 @@ def _gibbs(self, iterations): n_wz[b_i[1], topic] += 1 n_z[topic] += 1 Z[i] = topic + n_aw[b_i[0]] = VoseAlias(n_wz[b_i[0]]) + n_aw[b_i[1]] = VoseAlias(n_wz[b_i[1]]) + + #create alias table for each word + + n_aw[b_i[0]] = VA.n_wz[b_i[0]] for _ in trange(iterations): for i, b_i in enumerate(self.B): n_wz[b_i[0], Z[i]] -= 1 n_wz[b_i[1], Z[i]] -= 1 n_z[Z[i]] -= 1 - P_w0z = (n_wz[b_i[0], :] + self.beta[b_i[0], :]) / (2 * n_z + self.beta.sum(axis=0)) - P_w1z = (n_wz[b_i[1], :] + self.beta[b_i[1], :]) / (2 * n_z + 1 + self.beta.sum(axis=0)) - P_z = (n_z + self.alpha) * P_w0z * P_w1z - # P_z = (n_z + self.alpha) * ((n_wz[b_i[0], :] + self.beta[b_i[0], :]) * (n_wz[b_i[1], :] + self.beta[b_i[1], :]) / - # (((n_wz + self.beta).sum(axis=0) + 1) * (n_wz + self.beta).sum(axis=0))) # todo check out - P_z = P_z / P_z.sum() - Z[i] = np.random.choice(self.K, 1, p=P_z) - n_wz[b_i[0], Z[i]] += 1 - n_wz[b_i[1], Z[i]] += 1 - n_z[Z[i]] += 1 - + proposal = np.random.randint(0,1) + k_topic = Z[i] + if proposal == 0: + index = randomInt(0, len(self.V)) + proposal_topic = n_wz[b_i[0],index] #doesnt matter which biterm[0] or 1 + mh_acceptance = compute_corpus_acceptance(k_topic, proposal_topic) + else : + proposal_topic = n_aw[b_i[0]].alias_generation() + mh_acceptance = compute_term_acceptance(b_i[0],b_i[1],k_topic,proposal_topic) + mh_sample = randomFloat(0, 1) + if (mh_sample < mh_acceptance): + # increment_count_matrices(d, w, k) // reject proposal, revert to k + Z[i] = k + n_wz[b_i[0], Z[i]] += 1 + n_wz[b_i[1], Z[i]] += 1 + n_z[Z[i]] += 1 + else : + # increment_count_matrices(d, w, p) // accept proposal + Z[i] = p + n_wz[b_i[0], Z[i]] += 1 + n_wz[b_i[1], Z[i]] += 1 + n_z[Z[i]] += 1 return n_z, n_wz diff --git a/biterm/vose_sampler.py b/biterm/vose_sampler.py new file mode 100644 index 0000000..9d002e1 --- /dev/null +++ b/biterm/vose_sampler.py @@ -0,0 +1,92 @@ +#!/usr/bin/python + +#LIBRARIES: +# Standard library +import os +import random +import re +import sys +from decimal import * +from optparse import OptionParser +#import cProfile + +# Third-party libraries (only used temporarily for profiling) +#import memory_profiler + + +class VoseAlias(object): + """ A probability distribution for discrete weighted random variables and its probability/alias + tables for efficient sampling via Vose's Alias Method (a good explanation of which can be found at + http://www.keithschwarz.com/darts-dice-coins/). + """ + + def __init__(self, probabilities): + """ (VoseAlias, dict) -> NoneType """ + self.probabilities = probabilities + self.alias_initialisation() + self.table_prob_list = list(self.table_prob) + + def alias_initialisation(self): + """ Construct probability and alias tables for the distribution. """ + # Initialise variables + n = len(self.probabilities) + self.table_prob = {} # probability table + self.table_alias = {} # alias table + scaled_prob = {} # scaled probabilities + small = [] # stack for probabilities smaller that 1 + large = [] # stack for probabilities greater than or equal to 1 + + # Construct and sort the scaled probabilities into their appropriate stacks + for o, p in enumerate(self.probabilities): + scaled_prob[o] = Decimal(p) * n + + if scaled_prob[o] < 1: + small.append(o) + else: + large.append(o) + + # Construct the probability and alias tables + while small and large: + s = small.pop() + l = large.pop() + + self.table_prob[s] = scaled_prob[s] + self.table_alias[s] = l + + scaled_prob[l] = (scaled_prob[l] + scaled_prob[s]) - Decimal(1) + + if scaled_prob[l] < 1: + small.append(l) + else: + large.append(l) + + # The remaining outcomes (of one stack) must have probability 1 + while large: + self.table_prob[large.pop()] = Decimal(1) + + while small: + self.table_prob[small.pop()] = Decimal(1) + + def alias_generation(self): + """ Return a random outcome from the distribution. """ + # Determine which column of table_prob to inspect + col = random.choice(self.table_prob_list) + + # Determine which outcome to pick in that column + if self.table_prob[col] >= random.uniform(0,1): + return col + else: + return self.table_alias[col] + +def main(): + try: + test = VoseAlias([0.1,0.2,0.8]) + import pdb + pdb.set_trace() + print (test.alias_generation()) + except Exception as e: + sys.exit("\nError: %s" % e) + + +if __name__ == "__main__": + main() \ No newline at end of file From 9c89199619c371de372a19053f452f1acf405a4a Mon Sep 17 00:00:00 2001 From: srk Date: Fri, 2 Aug 2019 19:16:47 +0530 Subject: [PATCH 2/3] fixing code errors --- biterm/btm.py | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/biterm/btm.py b/biterm/btm.py index 5d0ff9c..18029a4 100644 --- a/biterm/btm.py +++ b/biterm/btm.py @@ -2,6 +2,7 @@ from itertools import combinations, chain from tqdm import trange from .vose_sampler import VoseAlias +import random class oBTM: @@ -18,27 +19,27 @@ def __init__(self, num_topics, V, alpha=1., beta=0.01, l=0.5): self.beta = np.full((len(self.V), self.K), beta) self.l = l - def compute_corpus_acceptance(k_topic,proposal_topic): - doc_proposal_1 = ((n_z[k_topic] + self.alpha[k_topic])*(n_wz[b_i[0],k_topic] + self.beta[b_i[0],k_topic])*(n_wz[b_i[1],k_topic] + self.beta[b_i[0],k_topic]))/ + def compute_corpus_acceptance(self,n_z,n_wz,b_i,k_topic,proposal_topic): + doc_proposal_1 = ((n_z[k_topic] + self.alpha[k_topic])*(n_wz[b_i[0],k_topic] + self.beta[b_i[0],k_topic])*(n_wz[b_i[1],k_topic] + self.beta[b_i[0],k_topic]))/ \ (n_z[proposal_topic] + self.alpha[proposal_topic])*(n_wz[b_i[0],proposal_topic] + self.beta[b_i[0],proposal_topic])*(n_wz[b_i[1],proposal_topic] + self.beta[b_i[0],proposal_topic]) - doc_proposal_2 = ((2 * n_z[proposal_topic] + self.beta[proposal_topic].sum(axis=0))**2)*(n_z[proposal_topic]+1+self.alpha[proposal_topic])/ + doc_proposal_2 = ((2 * n_z[proposal_topic] + self.beta[proposal_topic].sum(axis=0))**2)*(n_z[proposal_topic]+1+self.alpha[proposal_topic])/ \ ((2 * n_z[k_topic] + self.beta[k_topic].sum(axis=0))**2)*(n_z[k_topic]+1+self.alpha[k_topic]) doc_proposal = doc_proposal_1*doc_proposal_2 return min(1,doc_proposal) - def compute_term_acceptance(wi,wj,s_topic,t_topic): - term_proposal_1 = (n_wz[wi,t_topic] + self.beta[wi,t_topic])*(n_wz[wj,t_topic] + self.beta[wj,t_topic])*((2 * n_z[s_topic] + self.beta[wi,s_topic])**2)/ + def compute_term_acceptance(self,wi,wj,n_wz,n_z,s_topic,t_topic): + term_proposal_1 = (n_wz[wi,t_topic] + self.beta[wi,t_topic])*(n_wz[wj,t_topic] + self.beta[wj,t_topic])*((2 * n_z[s_topic] + self.beta[wi,s_topic])**2)/\ (n_wz[wi,s_topic] + self.beta[wi,s_topic])*(n_wz[wj,s_topic] + self.beta[wj,s_topic])*((2 * n_z[t_topic] + self.beta[wi,t_topic])**2) - term_proposal_2 = (n_z[t_topic] + self.alpha[t_topic])*(n_wz[wi,s_topic] + self.beta[wi,s_topic])*(2 * n_z[t_topic] + 1 + self.beta[wi,t_topic])/ + term_proposal_2 = (n_z[t_topic] + self.alpha[t_topic])*(n_wz[wi,s_topic] + self.beta[wi,s_topic])*(2 * n_z[t_topic] + 1 + self.beta[wi,t_topic])/\ (n_z[s_topic] + self.alpha[s_topic])*(n_wz[wi,t_topic] + self.beta[wi,t_topic])*(2 * n_z[s_topic] + 1 + self.beta[wi,s_topic]) - term_proposal = term_proposal*term_proposal_1 + term_proposal = term_proposal_1*term_proposal_2 return min(1,term_proposal) def _gibbs(self, iterations): Z = np.zeros(len(self.B), dtype=np.int16) n_wz = np.zeros((len(self.V), self.K), dtype=int) n_z = np.zeros(self.K, dtype=int) - n_aw = np.zeros(self.V) + n_aw = np.zeros(len(self.V),dtype=object) for i, b_i in enumerate(self.B): topic = np.random.choice(self.K, 1)[0] @@ -46,37 +47,35 @@ def _gibbs(self, iterations): n_wz[b_i[1], topic] += 1 n_z[topic] += 1 Z[i] = topic - n_aw[b_i[0]] = VoseAlias(n_wz[b_i[0]]) - n_aw[b_i[1]] = VoseAlias(n_wz[b_i[1]]) + for index,item in enumerate(n_wz): + n_aw[index] = VoseAlias(item.tolist()) #create alias table for each word - - n_aw[b_i[0]] = VA.n_wz[b_i[0]] for _ in trange(iterations): for i, b_i in enumerate(self.B): n_wz[b_i[0], Z[i]] -= 1 n_wz[b_i[1], Z[i]] -= 1 n_z[Z[i]] -= 1 - proposal = np.random.randint(0,1) + proposal = np.random.randint(0,2) k_topic = Z[i] if proposal == 0: - index = randomInt(0, len(self.V)) + index = np.random.randint(0,self.K) proposal_topic = n_wz[b_i[0],index] #doesnt matter which biterm[0] or 1 - mh_acceptance = compute_corpus_acceptance(k_topic, proposal_topic) + mh_acceptance = self.compute_corpus_acceptance(n_z,n_wz,b_i,k_topic, proposal_topic) else : proposal_topic = n_aw[b_i[0]].alias_generation() - mh_acceptance = compute_term_acceptance(b_i[0],b_i[1],k_topic,proposal_topic) - mh_sample = randomFloat(0, 1) + mh_acceptance = self.compute_term_acceptance(b_i[0],b_i[1],n_wz,n_z,k_topic,proposal_topic) + mh_sample = random.uniform(0, 1) if (mh_sample < mh_acceptance): # increment_count_matrices(d, w, k) // reject proposal, revert to k - Z[i] = k + Z[i] = k_topic n_wz[b_i[0], Z[i]] += 1 n_wz[b_i[1], Z[i]] += 1 n_z[Z[i]] += 1 else : # increment_count_matrices(d, w, p) // accept proposal - Z[i] = p + Z[i] = proposal_topic n_wz[b_i[0], Z[i]] += 1 n_wz[b_i[1], Z[i]] += 1 n_z[Z[i]] += 1 From e440374faa2ae66d2decc6e4150b9e0d40c4fad9 Mon Sep 17 00:00:00 2001 From: srk Date: Tue, 6 Aug 2019 14:04:34 +0530 Subject: [PATCH 3/3] variant of fast btm --- biterm/btm.py | 101 +++++++++++++++++++++++++++++++++++--------------- online_btm.py | 8 ++-- simple_btm.py | 6 +-- 3 files changed, 78 insertions(+), 37 deletions(-) diff --git a/biterm/btm.py b/biterm/btm.py index 18029a4..2fa90d2 100644 --- a/biterm/btm.py +++ b/biterm/btm.py @@ -3,7 +3,7 @@ from tqdm import trange from .vose_sampler import VoseAlias import random - +print ('fastbtm') class oBTM: """ Biterm Topic Model @@ -12,27 +12,43 @@ class oBTM: Thanks to jcapde for providing the code on https://github.com/jcapde/Biterm """ - def __init__(self, num_topics, V, alpha=1., beta=0.01, l=0.5): + def __init__(self, num_topics, V, alpha=1.0, beta=0.01, l=0.5): self.K = num_topics self.V = V self.alpha = np.full(self.K, alpha) self.beta = np.full((len(self.V), self.K), beta) self.l = l - def compute_corpus_acceptance(self,n_z,n_wz,b_i,k_topic,proposal_topic): - doc_proposal_1 = ((n_z[k_topic] + self.alpha[k_topic])*(n_wz[b_i[0],k_topic] + self.beta[b_i[0],k_topic])*(n_wz[b_i[1],k_topic] + self.beta[b_i[0],k_topic]))/ \ - (n_z[proposal_topic] + self.alpha[proposal_topic])*(n_wz[b_i[0],proposal_topic] + self.beta[b_i[0],proposal_topic])*(n_wz[b_i[1],proposal_topic] + self.beta[b_i[0],proposal_topic]) - doc_proposal_2 = ((2 * n_z[proposal_topic] + self.beta[proposal_topic].sum(axis=0))**2)*(n_z[proposal_topic]+1+self.alpha[proposal_topic])/ \ - ((2 * n_z[k_topic] + self.beta[k_topic].sum(axis=0))**2)*(n_z[k_topic]+1+self.alpha[k_topic]) - doc_proposal = doc_proposal_1*doc_proposal_2 + def compute_corpus_acceptance(self,n_z,n_wz,b_i,s_topic,t_topic): + doc_proposal_1 = (n_z[t_topic] + self.alpha[t_topic])*(n_wz[b_i[0],t_topic] + self.beta[b_i[0],t_topic])*(n_wz[b_i[1],t_topic] + self.beta[b_i[1],t_topic]) + doc_proposal_2 = (n_z[s_topic] + self.alpha[s_topic])*(n_wz[b_i[0],s_topic] + self.beta[b_i[0],s_topic])*(n_wz[b_i[1],s_topic] + self.beta[b_i[1],s_topic]) + doc_1 = doc_proposal_1/doc_proposal_2 + doc_proposal_3 = ((2 * n_z[s_topic] + (len(self.V)*self.beta[b_i[0],s_topic]))**2)*(n_z[s_topic]+1+self.alpha[s_topic]) + doc_proposal_4 = ((2 * n_z[t_topic] + (len(self.V)*self.beta[b_i[0],s_topic]))**2)*(n_z[t_topic]+1+self.alpha[t_topic]) + doc_2 = doc_proposal_3/doc_proposal_4 + doc_proposal = doc_1*doc_2 return min(1,doc_proposal) + def compute_term_proposal_1(self,wi,wj,n_wz,n_z,s_topic,t_topic): + return (n_wz[wi,t_topic] + self.beta[wi,t_topic])*(n_wz[wj,t_topic] + self.beta[wj,t_topic])*((2 * n_z[s_topic] + (len(self.V)*self.beta[wi,s_topic]))**2) + + def compute_term_proposal_2(self,wi,wj,n_wz,n_z,s_topic,t_topic): + return (n_wz[wi,s_topic] + self.beta[wi,s_topic])*(n_wz[wj,s_topic] + self.beta[wj,s_topic])*((2 * n_z[t_topic] + (len(self.V)*self.beta[wi,t_topic]))**2) + + def compute_term_proposal_3(self,wi,wj,n_wz,n_z,s_topic,t_topic): + return (n_z[t_topic] + self.alpha[t_topic])*(n_wz[wi,s_topic] + self.beta[wi,s_topic])*(2 * n_z[t_topic] + 1 + (len(self.V)*self.beta[wi,t_topic])) + + def compute_term_proposal_4(self,wi,wj,n_wz,n_z,s_topic,t_topic): + return (n_z[s_topic] + self.alpha[s_topic])*(n_wz[wi,t_topic] + self.beta[wi,t_topic])*(2 * n_z[s_topic] + 1 + (len(self.V)*self.beta[wi,s_topic])) + def compute_term_acceptance(self,wi,wj,n_wz,n_z,s_topic,t_topic): - term_proposal_1 = (n_wz[wi,t_topic] + self.beta[wi,t_topic])*(n_wz[wj,t_topic] + self.beta[wj,t_topic])*((2 * n_z[s_topic] + self.beta[wi,s_topic])**2)/\ - (n_wz[wi,s_topic] + self.beta[wi,s_topic])*(n_wz[wj,s_topic] + self.beta[wj,s_topic])*((2 * n_z[t_topic] + self.beta[wi,t_topic])**2) - term_proposal_2 = (n_z[t_topic] + self.alpha[t_topic])*(n_wz[wi,s_topic] + self.beta[wi,s_topic])*(2 * n_z[t_topic] + 1 + self.beta[wi,t_topic])/\ - (n_z[s_topic] + self.alpha[s_topic])*(n_wz[wi,t_topic] + self.beta[wi,t_topic])*(2 * n_z[s_topic] + 1 + self.beta[wi,s_topic]) - term_proposal = term_proposal_1*term_proposal_2 + t1 = self.compute_term_proposal_1(wi,wj,n_wz,n_z,s_topic,t_topic) + t2 = self.compute_term_proposal_2(wi,wj,n_wz,n_z,s_topic,t_topic) + t = t1/t2 + t3 = self.compute_term_proposal_3(wi,wj,n_wz,n_z,s_topic,t_topic) + t4 = self.compute_term_proposal_4(wi,wj,n_wz,n_z,s_topic,t_topic) + n = t3/t4 + term_proposal = t*n return min(1,term_proposal) def _gibbs(self, iterations): @@ -40,6 +56,7 @@ def _gibbs(self, iterations): n_wz = np.zeros((len(self.V), self.K), dtype=int) n_z = np.zeros(self.K, dtype=int) n_aw = np.zeros(len(self.V),dtype=object) + n_dw = np.zeros(len(self.B),dtype=int) for i, b_i in enumerate(self.B): topic = np.random.choice(self.K, 1)[0] @@ -47,6 +64,7 @@ def _gibbs(self, iterations): n_wz[b_i[1], topic] += 1 n_z[topic] += 1 Z[i] = topic + n_dw[i] = topic for index,item in enumerate(n_wz): n_aw[index] = VoseAlias(item.tolist()) @@ -59,27 +77,49 @@ def _gibbs(self, iterations): n_z[Z[i]] -= 1 proposal = np.random.randint(0,2) k_topic = Z[i] - if proposal == 0: + # index = np.random.randint(0,self.K) + # proposal_topic = n_dw[int(index)] + # if proposal == 0: + # mh_acceptance = self.compute_corpus_acceptance(n_z,n_wz,b_i,k_topic, proposal_topic) + # else: + # proposal_topic = n_aw[b_i[0]].alias_generation() + # mh_acceptance = self.compute_term_acceptance(b_i[0],b_i[1],n_wz,n_z,k_topic,proposal_topic) + # mh_sample = random.uniform(0,1) + # if (mh_sample < mh_acceptance): + # Z[i] = k_topic + # n_wz[b_i[0], Z[i]] += 1 + # n_wz[b_i[1], Z[i]] += 1 + # n_z[Z[i]] += 1 + # else: + # Z[i] = proposal_topic + # n_wz[b_i[0], Z[i]] += 1 + # n_wz[b_i[1], Z[i]] += 1 + # n_z[Z[i]] += 1 + for mh_step in range(1,2): index = np.random.randint(0,self.K) - proposal_topic = n_wz[b_i[0],index] #doesnt matter which biterm[0] or 1 + proposal_topic = n_dw[int(index)] + # proposal_topic = n_wz[b_i[0],index] #doesnt matter which biterm[0] or 1 mh_acceptance = self.compute_corpus_acceptance(n_z,n_wz,b_i,k_topic, proposal_topic) - else : + mh_sample = random.uniform(0,1) + if (mh_sample < mh_acceptance): + k_topic = proposal_topic proposal_topic = n_aw[b_i[0]].alias_generation() mh_acceptance = self.compute_term_acceptance(b_i[0],b_i[1],n_wz,n_z,k_topic,proposal_topic) - mh_sample = random.uniform(0, 1) - if (mh_sample < mh_acceptance): - # increment_count_matrices(d, w, k) // reject proposal, revert to k - Z[i] = k_topic - n_wz[b_i[0], Z[i]] += 1 - n_wz[b_i[1], Z[i]] += 1 - n_z[Z[i]] += 1 - else : - # increment_count_matrices(d, w, p) // accept proposal - Z[i] = proposal_topic - n_wz[b_i[0], Z[i]] += 1 - n_wz[b_i[1], Z[i]] += 1 - n_z[Z[i]] += 1 - + mh_sample = random.uniform(0,1) + if (mh_sample < mh_acceptance): + k_topic = proposal_topic + proposal_topic = n_aw[b_i[1]].alias_generation() + mh_acceptance = self.compute_term_acceptance(b_i[0],b_i[1],n_wz,n_z,k_topic,proposal_topic) + mh_sample = random.uniform(0,1) + if (mh_sample < mh_acceptance): + k_topic = proposal_topic + # n_aw[b_i[0]] = VoseAlias(n_wz[b_i[0]].tolist()) + # n_aw[b_i[1]] = VoseAlias(n_wz[b_i[1]].tolist()) + Z[i] = k_topic + n_wz[b_i[0], Z[i]] += 1 + n_wz[b_i[1], Z[i]] += 1 + n_z[Z[i]] += 1 + # return n_z, n_wz def fit_transform(self, B_d, iterations): @@ -87,6 +127,7 @@ def fit_transform(self, B_d, iterations): return self.transform(B_d) def fit(self, B_d, iterations): + print ("fastbtm") self.B = list(chain(*B_d)) n_z, self.nwz = self._gibbs(iterations) diff --git a/online_btm.py b/online_btm.py index 4796d04..9b3ebbb 100644 --- a/online_btm.py +++ b/online_btm.py @@ -25,12 +25,12 @@ print("\n\n Train Online BTM ..") for i in range(0, len(biterms), 100): # prozess chunk of 200 texts biterms_chunk = biterms[i:i + 100] - btm.fit(biterms_chunk, iterations=50) + btm.fit(biterms_chunk, iterations=100) topics = btm.transform(biterms) - print("\n\n Visualize Topics ..") - vis = pyLDAvis.prepare(btm.phi_wz.T, topics, np.count_nonzero(X, axis=1), vocab, np.sum(X, axis=0)) - pyLDAvis.save_html(vis, './vis/online_btm.html') + # print("\n\n Visualize Topics ..") + # vis = pyLDAvis.prepare(btm.phi_wz.T, topics, np.count_nonzero(X, axis=1), vocab, np.sum(X, axis=0)) + # pyLDAvis.save_html(vis, './vis/online_btm.html') print("\n\n Topic coherence ..") topic_summuary(btm.phi_wz.T, X, vocab, 10) diff --git a/simple_btm.py b/simple_btm.py index 3e30532..a374607 100644 --- a/simple_btm.py +++ b/simple_btm.py @@ -24,9 +24,9 @@ print("\n\n Train BTM ..") topics = btm.fit_transform(biterms, iterations=100) - print("\n\n Visualize Topics ..") - vis = pyLDAvis.prepare(btm.phi_wz.T, topics, np.count_nonzero(X, axis=1), vocab, np.sum(X, axis=0)) - pyLDAvis.save_html(vis, './vis/simple_btm.html') + # print("\n\n Visualize Topics ..") + # vis = pyLDAvis.prepare(btm.phi_wz.T, topics, np.count_nonzero(X, axis=1), vocab, np.sum(X, axis=0)) + # pyLDAvis.save_html(vis, './vis/simple_btm.html') print("\n\n Topic coherence ..") topic_summuary(btm.phi_wz.T, X, vocab, 10)