-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathvis.py
100 lines (68 loc) · 2.83 KB
/
vis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
""" Utilities to integrate with pyLDAvis
Most of the code is lifted from pyLDAvis.sklearn; the only difference is how the vocabulary is extracted.
"""
import funcy as fp
import numpy as np
import pyLDAvis as lda
def _get_doc_lengths(dtm):
    """Return the row sums of the document-term matrix as a flat array.

    Parameters
    ----------
    dtm : object exposing ``sum(axis=...)``
        Document-term matrix; a dense ``numpy.ndarray`` as well as scipy
        sparse matrices/arrays are accepted.

    Returns
    -------
    numpy.ndarray
        1-D array holding one total per row of ``dtm``.
    """
    # ``sum`` yields an ``np.matrix`` for scipy sparse matrices but a plain
    # ndarray for dense input; ``asarray(...).ravel()`` flattens both, whereas
    # ``getA1()`` only exists on ``np.matrix``.
    return np.asarray(dtm.sum(axis=1)).ravel()
def _get_term_freqs(dtm):
    """Return the column sums of the document-term matrix as a flat array.

    Parameters
    ----------
    dtm : object exposing ``sum(axis=...)``
        Document-term matrix; a dense ``numpy.ndarray`` as well as scipy
        sparse matrices/arrays are accepted.

    Returns
    -------
    numpy.ndarray
        1-D array holding one total per column of ``dtm``.
    """
    # ``sum`` yields an ``np.matrix`` for scipy sparse matrices but a plain
    # ndarray for dense input; ``asarray(...).ravel()`` flattens both, whereas
    # ``getA1()`` only exists on ``np.matrix``.
    return np.asarray(dtm.sum(axis=0)).ravel()
def _row_norm(dists):
    """Divide every row of ``dists`` by that row's sum.

    Used for both the document-topic and the topic-term distributions.
    """
    row_totals = dists.sum(axis=1)
    # Append an axis to the totals so the division broadcasts row by row.
    return dists / row_totals[:, None]
def _get_doc_topic_dists(lda_model, dtm):
    """Return the row-normalized result of ``lda_model.transform(dtm)``."""
    transformed = lda_model.transform(dtm)
    return _row_norm(transformed)
def _get_topic_term_dists(lda_model):
    """Return the row-normalized ``lda_model.components_``."""
    components = lda_model.components_
    return _row_norm(components)
def _get_vocab(id2term):
    """Return the values of ``id2term`` as a list ordered by ascending key."""
    return [id2term[feature_id] for feature_id in sorted(id2term.keys())]
def _extract_data(lda_model, dtm, id2term):
    """Gather the fields passed on to ``pyLDAvis.prepare``.

    Parameters
    ----------
    lda_model : object exposing ``components_`` and ``transform(dtm)``
    dtm : document-term matrix exposing ``shape`` and ``sum(axis=...)``
    id2term : dict mapping feature id to term

    Returns
    -------
    dict
        Keys ``'vocab'``, ``'doc_lengths'``, ``'term_frequency'``,
        ``'doc_topic_dists'`` and ``'topic_term_dists'``; every value is a
        plain Python list.

    Raises
    ------
    AssertionError
        If the vocabulary size differs from the number of term frequencies,
        or the topic-term distributions and ``dtm`` have a different number
        of columns.
    """
    vocab = _get_vocab(id2term)
    doc_lengths = _get_doc_lengths(dtm)
    term_freqs = _get_term_freqs(dtm)
    topic_term_dists = _get_topic_term_dists(lda_model)

    assert term_freqs.shape[0] == len(vocab), \
        ('Term frequencies and vocabulary are of different sizes, {} != {}.'
         .format(term_freqs.shape[0], len(vocab)))

    # Report the two values that are actually compared; the message used to
    # print len(vocab) in place of dtm.shape[1].
    assert topic_term_dists.shape[1] == dtm.shape[1], \
        ('Topic-term distributions and document-term matrix have different '
         'number of columns, {} != {}.'
         .format(topic_term_dists.shape[1], dtm.shape[1]))

    # column dimensions of document-term matrix and topic-term distributions
    # must match first before transforming to document-topic distributions
    doc_topic_dists = _get_doc_topic_dists(lda_model, dtm)

    return {'vocab': vocab,
            'doc_lengths': doc_lengths.tolist(),
            'term_frequency': term_freqs.tolist(),
            'doc_topic_dists': doc_topic_dists.tolist(),
            'topic_term_dists': topic_term_dists.tolist()}
def prepare(lda_model, dtm, id2term, **kwargs):
    """Build pyLDAvis prepared data from a fitted sklearn LDA model.

    Parameters
    ----------
    lda_model : sklearn.decomposition.LatentDirichletAllocation
        Latent Dirichlet Allocation model from sklearn, fitted with `dtm`.
    dtm : array-like or sparse matrix, shape=(n_samples, n_features)
        Document-term matrix the `lda_model` was fitted on.
    id2term : dict
        The <feature id>:<term word> dictionary.
    **kwargs
        Keyword arguments to be passed to `pyLDAvis.prepare()`.

    Returns
    -------
    prepared_data : PreparedData
        The data structures used in the visualization.

    Examples
    --------
    For example usage please see this notebook:
    http://nbviewer.ipython.org/github/bmabey/pyLDAvis/blob/master/notebooks/sklearn.ipynb

    See Also
    --------
    pyLDAvis.prepare : describes the accepted **kwargs.
    """
    extracted = _extract_data(lda_model, dtm, id2term)
    # Caller-supplied keyword arguments are merged in after the extracted
    # fields.
    prepare_args = fp.merge(extracted, kwargs)
    return lda.prepare(**prepare_args)