-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgraphlab.py
More file actions
78 lines (57 loc) · 2.39 KB
/
graphlab.py
File metadata and controls
78 lines (57 loc) · 2.39 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
"""
pyLDAvis GraphLab
===============
Helper functions to visualize GraphLab Create's TopicModel (an implementation of LDA)
"""
from __future__ import absolute_import
import funcy as fp
import numpy as np
import pandas as pd
import graphlab as gl
import pyLDAvis
def _topics_as_df(topic_model):
tdf = topic_model['topics'].to_dataframe()
return pd.DataFrame(np.vstack(tdf['topic_probabilities'].values), index=tdf['vocabulary'])
def _sum_sarray_dicts(sarray):
counts_sf = gl.SFrame({
'count_dicts': sarray}).stack('count_dicts').groupby(
key_columns='X1',
operations={'count': gl.aggregate.SUM('X2')})
return counts_sf.unstack(column=['X1', 'count'])[0].values()[0]
def _extract_doc_data(docs):
doc_lengths = list(docs.apply(lambda d: np.array(d.values()).sum()))
term_freqs_dict = _sum_sarray_dicts(docs)
vocab = term_freqs_dict.keys()
term_freqs = term_freqs_dict.values()
return {'doc_lengths': doc_lengths, 'vocab': vocab, 'term_frequency': term_freqs}
def _extract_model_data(topic_model, docs, vocab):
doc_topic_dists = np.vstack(topic_model.predict(docs, output_type='probabilities'))
topics = _topics_as_df(topic_model)
topic_term_dists = topics.T[vocab].values
return {'topic_term_dists': topic_term_dists, 'doc_topic_dists': doc_topic_dists}
def _extract_data(topic_model, docs):
doc_data = _extract_doc_data(docs)
model_data = _extract_model_data(topic_model, docs, doc_data['vocab'])
return fp.merge(doc_data, model_data)
def prepare(topic_model, docs, **kargs):
"""Transforms the GraphLab TopicModel and related corpus data into
the data structures needed for the visualization.
Parameters
----------
topic_model : graphlab.toolkits.topic_model.topic_model.TopicModel
An already trained GraphLab topic model.
docs : SArray of dicts
The corpus in bag of word form, the same docs used to train the model.
**kwargs :
additional keyword arguments are passed through to :func:`pyldavis.prepare`.
Returns
-------
prepared_data : PreparedData
the data structures used in the visualization
Example
--------
For example usage please see this notebook:
http://nbviewer.ipython.org/github/bmabey/pyLDAvis/blob/master/notebooks/GraphLab.ipynb
"""
opts = fp.merge(_extract_data(topic_model, docs), kargs)
return pyLDAvis.prepare(**opts)