diff --git a/biterm/btm.py b/biterm/btm.py index af548e9..2fa90d2 100644 --- a/biterm/btm.py +++ b/biterm/btm.py @@ -1,7 +1,9 @@ import numpy as np from itertools import combinations, chain from tqdm import trange - +from .vose_sampler import VoseAlias +import random +print ('fastbtm') class oBTM: """ Biterm Topic Model @@ -10,19 +12,51 @@ class oBTM: Thanks to jcapde for providing the code on https://github.com/jcapde/Biterm """ - def __init__(self, num_topics, V, alpha=1., beta=0.01, l=0.5): + def __init__(self, num_topics, V, alpha=1.0, beta=0.01, l=0.5): self.K = num_topics self.V = V self.alpha = np.full(self.K, alpha) self.beta = np.full((len(self.V), self.K), beta) self.l = l + def compute_corpus_acceptance(self,n_z,n_wz,b_i,s_topic,t_topic): + doc_proposal_1 = (n_z[t_topic] + self.alpha[t_topic])*(n_wz[b_i[0],t_topic] + self.beta[b_i[0],t_topic])*(n_wz[b_i[1],t_topic] + self.beta[b_i[1],t_topic]) + doc_proposal_2 = (n_z[s_topic] + self.alpha[s_topic])*(n_wz[b_i[0],s_topic] + self.beta[b_i[0],s_topic])*(n_wz[b_i[1],s_topic] + self.beta[b_i[1],s_topic]) + doc_1 = doc_proposal_1/doc_proposal_2 + doc_proposal_3 = ((2 * n_z[s_topic] + (len(self.V)*self.beta[b_i[0],s_topic]))**2)*(n_z[s_topic]+1+self.alpha[s_topic]) + doc_proposal_4 = ((2 * n_z[t_topic] + (len(self.V)*self.beta[b_i[0],s_topic]))**2)*(n_z[t_topic]+1+self.alpha[t_topic]) + doc_2 = doc_proposal_3/doc_proposal_4 + doc_proposal = doc_1*doc_2 + return min(1,doc_proposal) + + def compute_term_proposal_1(self,wi,wj,n_wz,n_z,s_topic,t_topic): + return (n_wz[wi,t_topic] + self.beta[wi,t_topic])*(n_wz[wj,t_topic] + self.beta[wj,t_topic])*((2 * n_z[s_topic] + (len(self.V)*self.beta[wi,s_topic]))**2) + + def compute_term_proposal_2(self,wi,wj,n_wz,n_z,s_topic,t_topic): + return (n_wz[wi,s_topic] + self.beta[wi,s_topic])*(n_wz[wj,s_topic] + self.beta[wj,s_topic])*((2 * n_z[t_topic] + (len(self.V)*self.beta[wi,t_topic]))**2) + + def compute_term_proposal_3(self,wi,wj,n_wz,n_z,s_topic,t_topic): + return (n_z[t_topic] + self.alpha[t_topic])*(n_wz[wi,s_topic] + self.beta[wi,s_topic])*(2 * n_z[t_topic] + 1 + (len(self.V)*self.beta[wi,t_topic])) + + def compute_term_proposal_4(self,wi,wj,n_wz,n_z,s_topic,t_topic): + return (n_z[s_topic] + self.alpha[s_topic])*(n_wz[wi,t_topic] + self.beta[wi,t_topic])*(2 * n_z[s_topic] + 1 + (len(self.V)*self.beta[wi,s_topic])) + + def compute_term_acceptance(self,wi,wj,n_wz,n_z,s_topic,t_topic): + t1 = self.compute_term_proposal_1(wi,wj,n_wz,n_z,s_topic,t_topic) + t2 = self.compute_term_proposal_2(wi,wj,n_wz,n_z,s_topic,t_topic) + t = t1/t2 + t3 = self.compute_term_proposal_3(wi,wj,n_wz,n_z,s_topic,t_topic) + t4 = self.compute_term_proposal_4(wi,wj,n_wz,n_z,s_topic,t_topic) + n = t3/t4 + term_proposal = t*n + return min(1,term_proposal) def _gibbs(self, iterations): - Z = np.zeros(len(self.B), dtype=np.int16) n_wz = np.zeros((len(self.V), self.K), dtype=int) n_z = np.zeros(self.K, dtype=int) + n_aw = np.zeros(len(self.V),dtype=object) + n_dw = np.zeros(len(self.B),dtype=int) for i, b_i in enumerate(self.B): topic = np.random.choice(self.K, 1)[0] @@ -30,24 +64,62 @@ def _gibbs(self, iterations): n_wz[b_i[1], topic] += 1 n_z[topic] += 1 Z[i] = topic + n_dw[i] = topic + + for index,item in enumerate(n_wz): + n_aw[index] = VoseAlias(item.tolist()) + #create alias table for each word for _ in trange(iterations): for i, b_i in enumerate(self.B): n_wz[b_i[0], Z[i]] -= 1 n_wz[b_i[1], Z[i]] -= 1 n_z[Z[i]] -= 1 - P_w0z = (n_wz[b_i[0], :] + self.beta[b_i[0], :]) / (2 * n_z + self.beta.sum(axis=0)) - P_w1z = (n_wz[b_i[1], :] + self.beta[b_i[1], :]) / (2 * n_z + 1 + self.beta.sum(axis=0)) - P_z = (n_z + self.alpha) * P_w0z * P_w1z - # P_z = (n_z + self.alpha) * ((n_wz[b_i[0], :] + self.beta[b_i[0], :]) * (n_wz[b_i[1], :] + self.beta[b_i[1], :]) / - # (((n_wz + self.beta).sum(axis=0) + 1) * (n_wz + self.beta).sum(axis=0))) # todo check out - P_z = P_z / P_z.sum() - Z[i] = np.random.choice(self.K, 1, p=P_z) + proposal = np.random.randint(0,2) + k_topic = Z[i] + # index = np.random.randint(0,self.K) + # proposal_topic = n_dw[int(index)] + # if proposal == 0: + # mh_acceptance = self.compute_corpus_acceptance(n_z,n_wz,b_i,k_topic, proposal_topic) + # else: + # proposal_topic = n_aw[b_i[0]].alias_generation() + # mh_acceptance = self.compute_term_acceptance(b_i[0],b_i[1],n_wz,n_z,k_topic,proposal_topic) + # mh_sample = random.uniform(0,1) + # if (mh_sample < mh_acceptance): + # Z[i] = k_topic + # n_wz[b_i[0], Z[i]] += 1 + # n_wz[b_i[1], Z[i]] += 1 + # n_z[Z[i]] += 1 + # else: + # Z[i] = proposal_topic + # n_wz[b_i[0], Z[i]] += 1 + # n_wz[b_i[1], Z[i]] += 1 + # n_z[Z[i]] += 1 + for mh_step in range(1,2): + index = np.random.randint(0,self.K) + proposal_topic = n_dw[int(index)] + # proposal_topic = n_wz[b_i[0],index] #doesnt matter which biterm[0] or 1 + mh_acceptance = self.compute_corpus_acceptance(n_z,n_wz,b_i,k_topic, proposal_topic) + mh_sample = random.uniform(0,1) + if (mh_sample < mh_acceptance): + k_topic = proposal_topic + proposal_topic = n_aw[b_i[0]].alias_generation() + mh_acceptance = self.compute_term_acceptance(b_i[0],b_i[1],n_wz,n_z,k_topic,proposal_topic) + mh_sample = random.uniform(0,1) + if (mh_sample < mh_acceptance): + k_topic = proposal_topic + proposal_topic = n_aw[b_i[1]].alias_generation() + mh_acceptance = self.compute_term_acceptance(b_i[0],b_i[1],n_wz,n_z,k_topic,proposal_topic) + mh_sample = random.uniform(0,1) + if (mh_sample < mh_acceptance): + k_topic = proposal_topic + # n_aw[b_i[0]] = VoseAlias(n_wz[b_i[0]].tolist()) + # n_aw[b_i[1]] = VoseAlias(n_wz[b_i[1]].tolist()) + Z[i] = k_topic n_wz[b_i[0], Z[i]] += 1 n_wz[b_i[1], Z[i]] += 1 n_z[Z[i]] += 1 - - + # return n_z, n_wz def fit_transform(self, B_d, iterations): @@ -55,6 +127,7 @@ def fit_transform(self, B_d, iterations): return self.transform(B_d) def fit(self, B_d, iterations): + print ("fastbtm") self.B = list(chain(*B_d)) n_z, self.nwz = self._gibbs(iterations) diff --git a/biterm/vose_sampler.py b/biterm/vose_sampler.py new file mode 100644 index 0000000..9d002e1 --- /dev/null +++ b/biterm/vose_sampler.py @@ -0,0 +1,92 @@ +#!/usr/bin/python + +#LIBRARIES: +# Standard library +import os +import random +import re +import sys +from decimal import * +from optparse import OptionParser +#import cProfile + +# Third-party libraries (only used temporarily for profiling) +#import memory_profiler + + +class VoseAlias(object): + """ A probability distribution for discrete weighted random variables and its probability/alias + tables for efficient sampling via Vose's Alias Method (a good explanation of which can be found at + http://www.keithschwarz.com/darts-dice-coins/). + """ + + def __init__(self, probabilities): + """ (VoseAlias, dict) -> NoneType """ + self.probabilities = probabilities + self.alias_initialisation() + self.table_prob_list = list(self.table_prob) + + def alias_initialisation(self): + """ Construct probability and alias tables for the distribution. """ + # Initialise variables + n = len(self.probabilities) + self.table_prob = {} # probability table + self.table_alias = {} # alias table + scaled_prob = {} # scaled probabilities + small = [] # stack for probabilities smaller that 1 + large = [] # stack for probabilities greater than or equal to 1 + + # Construct and sort the scaled probabilities into their appropriate stacks + for o, p in enumerate(self.probabilities): + scaled_prob[o] = Decimal(p) * n + + if scaled_prob[o] < 1: + small.append(o) + else: + large.append(o) + + # Construct the probability and alias tables + while small and large: + s = small.pop() + l = large.pop() + + self.table_prob[s] = scaled_prob[s] + self.table_alias[s] = l + + scaled_prob[l] = (scaled_prob[l] + scaled_prob[s]) - Decimal(1) + + if scaled_prob[l] < 1: + small.append(l) + else: + large.append(l) + + # The remaining outcomes (of one stack) must have probability 1 + while large: + self.table_prob[large.pop()] = Decimal(1) + + while small: + self.table_prob[small.pop()] = Decimal(1) + + def alias_generation(self): + """ Return a random outcome from the distribution. """ + # Determine which column of table_prob to inspect + col = random.choice(self.table_prob_list) + + # Determine which outcome to pick in that column + if self.table_prob[col] >= random.uniform(0,1): + return col + else: + return self.table_alias[col] + +def main(): + try: + test = VoseAlias([0.1,0.2,0.8]) + import pdb + pdb.set_trace() + print (test.alias_generation()) + except Exception as e: + sys.exit("\nError: %s" % e) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/online_btm.py b/online_btm.py index 4796d04..9b3ebbb 100644 --- a/online_btm.py +++ b/online_btm.py @@ -25,12 +25,12 @@ print("\n\n Train Online BTM ..") for i in range(0, len(biterms), 100): # prozess chunk of 200 texts biterms_chunk = biterms[i:i + 100] - btm.fit(biterms_chunk, iterations=50) + btm.fit(biterms_chunk, iterations=100) topics = btm.transform(biterms) - print("\n\n Visualize Topics ..") - vis = pyLDAvis.prepare(btm.phi_wz.T, topics, np.count_nonzero(X, axis=1), vocab, np.sum(X, axis=0)) - pyLDAvis.save_html(vis, './vis/online_btm.html') + # print("\n\n Visualize Topics ..") + # vis = pyLDAvis.prepare(btm.phi_wz.T, topics, np.count_nonzero(X, axis=1), vocab, np.sum(X, axis=0)) + # pyLDAvis.save_html(vis, './vis/online_btm.html') print("\n\n Topic coherence ..") topic_summuary(btm.phi_wz.T, X, vocab, 10) diff --git a/simple_btm.py b/simple_btm.py index 3e30532..a374607 100644 --- a/simple_btm.py +++ b/simple_btm.py @@ -24,9 +24,9 @@ print("\n\n Train BTM ..") topics = btm.fit_transform(biterms, iterations=100) - print("\n\n Visualize Topics ..") - vis = pyLDAvis.prepare(btm.phi_wz.T, topics, np.count_nonzero(X, axis=1), vocab, np.sum(X, axis=0)) - pyLDAvis.save_html(vis, './vis/simple_btm.html') + # print("\n\n Visualize Topics ..") + # vis = pyLDAvis.prepare(btm.phi_wz.T, topics, np.count_nonzero(X, axis=1), vocab, np.sum(X, axis=0)) + # pyLDAvis.save_html(vis, './vis/simple_btm.html') print("\n\n Topic coherence ..") topic_summuary(btm.phi_wz.T, X, vocab, 10)