# master_thesis/topic_analysis.py at master · tyevhen/master_thesis · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# from data_helper import *
# from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
# from sklearn.decomposition import LatentDirichletAllocation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb


def get_doc_top_topics_probs(topic2doc, n):
    """
    Return, for every document, the index of its n-th most probable topic.

    Each row of ``topic2doc`` is partially sorted with ``np.argpartition`` so
    that the indices of its ``n`` largest entries occupy the last ``n``
    positions. The first of those positions is the n-th largest entry, and
    its topic index is what gets collected. With ``n == 1`` the result is the
    per-row argmax. Despite the function name, topic *indices* are returned,
    not probabilities.

    :param topic2doc: Topic-to-document matrix; rows are iterated as documents
        (assumed to be a 2-D NumPy array -- confirm against the caller)
    :param n: Rank of the topic to extract, 1 meaning the most probable topic
    :return: List with one topic index (Python int) per document, in row order
    :raises ValueError: If ``n`` is smaller than 1. NumPy raises the same
        exception type when ``n`` exceeds the number of entries in a row.
    """
    # A non-positive n would not fail in NumPy: n == 0 makes ``[-0:]`` span the
    # whole row, so the "top" topic would silently become the least probable.
    if n < 1:
        raise ValueError("n must be at least 1, got {}".format(n))

    nth_topic_per_doc = []
    for doc_idx in range(topic2doc.shape[0]):
        # Position -n of the partitioned order holds the n-th largest entry.
        partitioned_idxs = np.argpartition(topic2doc[doc_idx], -n)
        nth_topic_per_doc.append(partitioned_idxs[-n:].item(0))
    return nth_topic_per_doc


def get_topic_labels(topic2doc):
    """
    Label every document with its highest-scoring topic.

    For each row index of ``topic2doc`` the position of the row's maximum is
    looked up via ``argmax()``, reported on stdout and collected.

    :param topic2doc: Topic-to-document matrix; rows are iterated as documents
    :return: List holding, per document, the value returned by ``argmax()``
        on that document's row
    """
    report_line = "doc: {} topic: {}\n"

    def _label_and_report(doc_idx):
        # Resolve the dominant topic of one document and echo it.
        dominant = topic2doc[doc_idx].argmax()
        print(report_line.format(doc_idx, dominant))
        return dominant

    return [_label_and_report(doc_idx) for doc_idx in range(topic2doc.shape[0])]

def get_topic_top_words(model, features_vocab, no_words, topic_id):
    """
    Build a space-separated string of the highest-weighted words of a topic.

    A ``Topic #<id>:`` header is printed first; the words themselves are
    returned, not printed.

    :param model: Model instance exposing a ``components_`` matrix that is
        indexed by topic id
    :param features_vocab: Model features; indexed by the positions found in
        a ``components_`` row
    :param no_words: Number of words to be extracted
    :param topic_id: Topic id number
    :return: The ``no_words`` words with the largest weights in the topic's
        row, ordered from largest to smallest weight and joined by spaces
    """
    components = model.components_
    print("\nTopic #{}:".format(topic_id))
    # argsort is ascending, so reverse it and keep the leading entries.
    ascending_order = components[topic_id].argsort()
    strongest_first = ascending_order[::-1][:no_words]
    return " ".join(features_vocab[word_idx] for word_idx in strongest_first)


def plot_heatmap(matrix, labels):
    """
    Show a seaborn heatmap of a matrix with one labelled row per document.

    Rows are named ``doc 0``, ``doc 1``, ... and columns take their names
    from ``labels``. The figure is displayed through ``plt.show()``.

    :param matrix: Data distribution matrix
    :param labels: Topic labels array, used as the column names
    :return: None
    """
    doc_count = matrix.shape[0]
    doc_names = ["doc " + str(doc_idx) for doc_idx in range(doc_count)]
    frame = pd.DataFrame(matrix, index=doc_names, columns=labels)

    # Apply seaborn's default theme before drawing.
    sb.set()
    sb.heatmap(frame)
    plt.show()