Source code for qrmine.visualize

"""
Copyright (C) 2025 Bell Eapen

This file is part of qrmine.

qrmine is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

qrmine is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with qrmine.  If not, see <https://www.gnu.org/licenses/>.
"""

from collections import Counter

import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.patches import Rectangle
from matplotlib.ticker import FuncFormatter
from sklearn.manifold import TSNE
from wordcloud import STOPWORDS, WordCloud



[docs]
class QRVisualize:
    def __init__(self, data: pd.DataFrame = None):
        """
        Initialize the QRVisualize class with a DataFrame.

        Parameters:
        data (pd.DataFrame): The DataFrame containing the data to visualize.
        """
        self.data = data


[docs]
    def plot_frequency_distribution_of_words(self, df=None, folder_path=None):
        if df is None:
            df = self.data
        doc_lens = [len(d) for d in df.Text]

        # Plot
        plt.figure(figsize=(16, 7), dpi=160)
        plt.hist(doc_lens, bins=1000, color="navy")
        plt.text(750, 100, "Mean   : " + str(round(np.mean(doc_lens))))
        plt.text(750, 90, "Median : " + str(round(np.median(doc_lens))))
        plt.text(750, 80, "Stdev   : " + str(round(np.std(doc_lens))))
        plt.text(750, 70, "1%ile    : " + str(round(np.quantile(doc_lens, q=0.01))))
        plt.text(750, 60, "99%ile  : " + str(round(np.quantile(doc_lens, q=0.99))))

        plt.gca().set(
            xlim=(0, 1000), ylabel="Number of Documents", xlabel="Document Word Count"
        )
        plt.tick_params(size=16)
        plt.xticks(np.linspace(0, 1000, 9))
        plt.title("Distribution of Document Word Counts", fontdict=dict(size=22))
        plt.show(block=False)
        # save
        if folder_path:
            plt.savefig(folder_path)
            plt.close()



[docs]
    def plot_distribution_by_topic(self, df=None, folder_path=None):
        if df is None:
            df = self.data
        # Plot
        cols = [
            color for name, color in mcolors.TABLEAU_COLORS.items()
        ]  # more colors: 'mcolors.XKCD_COLORS'

        fig, axes = plt.subplots(
            2, 2, figsize=(16, 14), dpi=160, sharex=True, sharey=True
        )

        for i, ax in enumerate(axes.flatten()):
            df_dominant_topic_sub = df.loc[df.Dominant_Topic == i, :]
            doc_lens = [len(d) for d in df_dominant_topic_sub.Text]
            ax.hist(doc_lens, bins=1000, color=cols[i])
            ax.tick_params(axis="y", labelcolor=cols[i], color=cols[i])
            sns.kdeplot(
                doc_lens, color="black", fill=False, ax=ax.twinx(), warn_singular=False
            )
            ax.set(xlim=(0, 1000), xlabel="Document Word Count")
            ax.set_ylabel("Number of Documents", color=cols[i])
            ax.set_title("Topic: " + str(i), fontdict=dict(size=16, color=cols[i]))

        fig.tight_layout()
        fig.subplots_adjust(top=0.90)
        plt.xticks(np.linspace(0, 1000, 9))
        fig.suptitle(
            "Distribution of Document Word Counts by Dominant Topic", fontsize=22
        )
        plt.show(block=False)
        # save
        if folder_path:
            plt.savefig(folder_path)
            plt.close()



[docs]
    def plot_wordcloud(self, topics=None, folder_path=None):
        cols = [
            color for name, color in mcolors.TABLEAU_COLORS.items()
        ]  # more colors: 'mcolors.XKCD_COLORS'

        cloud = WordCloud(
            stopwords=STOPWORDS,
            background_color="white",
            width=250,
            height=180,
            max_words=5,
            colormap="tab10",
            color_func=lambda *args, **kwargs: cols[i],
            prefer_horizontal=1.0,
        )

        fig, axes = plt.subplots(2, 2, figsize=(10, 10), sharex=True, sharey=True)

        for i, ax in enumerate(axes.flatten()):
            fig.add_subplot(ax)
            topic_words = dict(topics[i][1])
            cloud.generate_from_frequencies(topic_words, max_font_size=300)
            plt.gca().imshow(cloud)
            plt.gca().set_title("Topic " + str(i), fontdict=dict(size=16))
            plt.gca().axis("off")

        plt.subplots_adjust(wspace=0, hspace=0)
        plt.axis("off")
        plt.margins(x=0, y=0)
        plt.tight_layout()
        plt.show(block=False)
        # save
        if folder_path:
            plt.savefig(folder_path)
            plt.close()



[docs]
    def plot_importance(self, topics=None, processed_docs=None, folder_path=None):
        data_flat = [w for w_list in processed_docs for w in w_list]
        counter = Counter(data_flat)

        out = []
        for i, topic in topics:
            for word, weight in topic:
                out.append([word, i, weight, counter[word]])

        df = pd.DataFrame(out, columns=["word", "topic_id", "importance", "word_count"])

        # Plot Word Count and Weights of Topic Keywords
        fig, axes = plt.subplots(2, 2, figsize=(16, 10), sharey=True, dpi=160)
        cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]
        for i, ax in enumerate(axes.flatten()):
            ax.bar(
                x="word",
                height="word_count",
                data=df.loc[df.topic_id == i, :],
                color=cols[i],
                width=0.5,
                alpha=0.3,
                label="Word Count",
            )
            ax_twin = ax.twinx()
            ax_twin.bar(
                x="word",
                height="importance",
                data=df.loc[df.topic_id == i, :],
                color=cols[i],
                width=0.2,
                label="Weights",
            )
            ax.set_ylabel("Word Count", color=cols[i])
            ax_twin.set_ylim(0, 0.030)
            ax.set_ylim(0, 3500)
            ax.set_title("Topic: " + str(i), color=cols[i], fontsize=16)
            ax.tick_params(axis="y", left=False)
            ax.set_xticklabels(
                df.loc[df.topic_id == i, "word"],
                rotation=30,
                horizontalalignment="right",
            )
            ax.legend(loc="upper left")
            ax_twin.legend(loc="upper right")

        fig.tight_layout(w_pad=2)
        fig.suptitle("Word Count and Importance of Topic Keywords", fontsize=22, y=1.05)
        plt.show(block=False)
        # save
        if folder_path:
            plt.savefig(folder_path)
            plt.close()



[docs]
    def sentence_chart(self, lda_model=None, corpus=None, start=0, end=13, folder_path=None):
        if lda_model is None:
            raise ValueError("LDA model is not provided.")
        corp = corpus[start:end]
        mycolors = [color for name, color in mcolors.TABLEAU_COLORS.items()]

        fig, axes = plt.subplots(
            end - start, 1, figsize=(20, (end - start) * 0.95), dpi=160
        )
        axes[0].axis("off")
        for i, ax in enumerate(axes):
            try:
                if i > 0:
                    corp_cur = corp[i - 1]
                    topic_percs, wordid_topics, _ = lda_model[corp_cur]
                    word_dominanttopic = [
                        (lda_model.id2word[wd], topic[0]) for wd, topic in wordid_topics
                    ]
                    ax.text(
                        0.01,
                        0.5,
                        "Doc " + str(i - 1) + ": ",
                        verticalalignment="center",
                        fontsize=16,
                        color="black",
                        transform=ax.transAxes,
                        fontweight=700,
                    )

                    # Draw Rectange
                    topic_percs_sorted = sorted(
                        topic_percs, key=lambda x: (x[1]), reverse=True
                    )
                    ax.add_patch(
                        Rectangle(
                            (0.0, 0.05),
                            0.99,
                            0.90,
                            fill=None,
                            alpha=1,
                            color=mycolors[topic_percs_sorted[0][0]],
                            linewidth=2,
                        )
                    )

                    word_pos = 0.06
                    for j, (word, topics) in enumerate(word_dominanttopic):
                        if j < 14:
                            ax.text(
                                word_pos,
                                0.5,
                                word,
                                horizontalalignment="left",
                                verticalalignment="center",
                                fontsize=16,
                                color=mycolors[topics],
                                transform=ax.transAxes,
                                fontweight=700,
                            )
                            word_pos += 0.009 * len(
                                word
                            )  # to move the word for the next iter
                            ax.axis("off")
                    ax.text(
                        word_pos,
                        0.5,
                        ". . .",
                        horizontalalignment="left",
                        verticalalignment="center",
                        fontsize=16,
                        color="black",
                        transform=ax.transAxes,
                    )
            except:
                continue

        plt.subplots_adjust(wspace=0, hspace=0)
        plt.suptitle(
            "Sentence Topic Coloring for Documents: "
            + str(start)
            + " to "
            + str(end - 2),
            fontsize=22,
            y=0.95,
            fontweight=700,
        )
        plt.tight_layout()
        plt.show(block=False)
        # save
        if folder_path:
            plt.savefig(folder_path)
            plt.close()


    def _cluster_chart(self, lda_model=None, corpus=None, n_topics=3, folder_path=None):
        # Get topic weights
        topic_weights = []
        for i, row_list in enumerate(lda_model[corpus]):
            topic_weights.append([w for i, w in row_list[0]])

        # Array of topic weights
        arr = pd.DataFrame(topic_weights).fillna(0).values

        # Keep the well separated points (optional)
        arr = arr[np.amax(arr, axis=1) > 0.35]

        # Dominant topic number in each doc
        topic_num = np.argmax(arr, axis=1)

        # tSNE Dimension Reduction
        tsne_model = TSNE(
            n_components=2, verbose=1, random_state=0, angle=0.99, init="pca"
        )
        tsne_lda = tsne_model.fit_transform(arr)

        # Plot
        plt.figure(figsize=(16, 10), dpi=160)
        for i in range(n_topics):
            plt.scatter(
                tsne_lda[topic_num == i, 0],
                tsne_lda[topic_num == i, 1],
                label=str(i),
                alpha=0.5,
            )
        plt.title("t-SNE Clustering of Topics", fontsize=22)
        plt.xlabel("t-SNE Dimension 1", fontsize=16)
        plt.ylabel("t-SNE Dimension 2", fontsize=16)
        plt.legend(title="Topic Number", loc="upper right")
        plt.show(block=False)
        # save
        if folder_path:
            plt.savefig(folder_path)
            plt.close()


[docs]
    def most_discussed_topics(
        self, lda_model, dominant_topics, topic_percentages, folder_path=None
    ):

        # Distribution of Dominant Topics in Each Document
        df = pd.DataFrame(dominant_topics, columns=["Document_Id", "Dominant_Topic"])
        dominant_topic_in_each_doc = df.groupby("Dominant_Topic").size()
        df_dominant_topic_in_each_doc = dominant_topic_in_each_doc.to_frame(
            name="count"
        ).reset_index()

        # Total Topic Distribution by actual weight
        topic_weightage_by_doc = pd.DataFrame([dict(t) for t in topic_percentages])
        df_topic_weightage_by_doc = (
            topic_weightage_by_doc.sum().to_frame(name="count").reset_index()
        )

        # Top 3 Keywords for each Topic
        topic_top3words = [
            (i, topic)
            for i, topics in lda_model.show_topics(formatted=False)
            for j, (topic, wt) in enumerate(topics)
            if j < 3
        ]

        df_top3words_stacked = pd.DataFrame(
            topic_top3words, columns=["topic_id", "words"]
        )
        df_top3words = df_top3words_stacked.groupby("topic_id").agg(", \n".join)
        df_top3words.reset_index(level=0, inplace=True)

        # Plot
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4), dpi=120, sharey=True)

        # Topic Distribution by Dominant Topics
        ax1.bar(
            x="Dominant_Topic",
            height="count",
            data=df_dominant_topic_in_each_doc,
            width=0.5,
            color="firebrick",
        )
        ax1.set_xticks(
            range(df_dominant_topic_in_each_doc.Dominant_Topic.unique().__len__())
        )
        tick_formatter = FuncFormatter(
            lambda x, pos: "Topic "
            + str(x)
            + "\n"
            + df_top3words.loc[df_top3words.topic_id == x, "words"].values[0]
        )
        ax1.xaxis.set_major_formatter(tick_formatter)
        ax1.set_title("Number of Documents by Dominant Topic", fontdict=dict(size=10))
        ax1.set_ylabel("Number of Documents")
        ax1.set_ylim(0, 1000)

        # Topic Distribution by Topic Weights
        ax2.bar(
            x="index",
            height="count",
            data=df_topic_weightage_by_doc,
            width=0.5,
            color="steelblue",
        )
        ax2.set_xticks(range(df_topic_weightage_by_doc.index.unique().__len__()))
        ax2.xaxis.set_major_formatter(tick_formatter)
        ax2.set_title("Number of Documents by Topic Weightage", fontdict=dict(size=10))

        plt.show(block=False)

        # save
        if folder_path:
            plt.savefig(folder_path)
            plt.close()



[docs]
    def update_annot(self, ind):
        norm = plt.Normalize(1,4)
        cmap = plt.cm.RdYlGn
        pos = self.sc.get_offsets()[ind["ind"][0]]
        self.annot.xy = pos
        text = "{}, {}".format(
            " ".join(list(map(str, ind["ind"]))), " ".join([self.names[n] for n in ind["ind"]])
        )
        self.annot.set_text(text)
        self.annot.get_bbox_patch().set_facecolor(cmap(norm(c[ind["ind"][0]])))
        self.annot.get_bbox_patch().set_alpha(0.4)



[docs]
    def hover(self, event):
        vis = self.annot.get_visible()
        if event.inaxes == self.ax:
            cont, ind = self.sc.contains(event)
            if cont:
                self.update_annot(ind)
                self.annot.set_visible(True)
                self.fig.canvas.draw_idle()
            else:
                if vis:
                    self.annot.set_visible(False)
                    self.fig.canvas.draw_idle()


    # https://stackoverflow.com/questions/7908636/how-to-add-hovering-annotations-to-a-plot

[docs]
    def cluster_chart (self, data, folder_path=None):
        # Scatter plot for Text Cluster Prediction
        plt.figure(figsize=(6, 6))
        self.fig, self.ax = plt.subplots()
        self.names = data['title']
        self.sc = plt.scatter(data['x'], data['y'], c=data['colour'], s=36, edgecolors='black', linewidths=0.75)
        self.annot = self.ax.annotate("", xy=(0,0), xytext=(20,20),textcoords="offset points",
                    bbox=dict(boxstyle="round", fc="w"),
                    arrowprops=dict(arrowstyle="->"))
        self.annot.set_visible(False)
        plt.title('Text Cluster Prediction')
        plt.axis('off')  # Optional: Remove axes for a cleaner look
        plt.colorbar(self.sc, label='Colour')  # Add colorbar if needed
        self.fig.canvas.mpl_connect("motion_notify_event", self.hover)
        plt.show(block=False)
        # save
        if folder_path:
            # annotate with data['title']
            for i, txt in enumerate(data['title']):
                plt.annotate(txt, (data['x'][i], data['y'][i]), fontsize=8, ha='right', va='bottom')
            plt.savefig(folder_path)
            plt.close()
Source code for qrmine.visualize

qrmine

Navigation

Related Topics