# Source code for berttopic_utils

""" This module provides functionality to create topic modelling models and visualizations.
"""

from nltk.corpus import stopwords
from generate_utils import filter_by_topic
from bertopic import BERTopic
import json
import re


def remove_stopwords(texts, stop_words):
    """
    Given a list of sentences, remove the stop words from each sentence.

    Words of length 3 or less are also dropped, matching the original
    cleaning behavior for short/noise tokens.

    :param texts: List of sentences (strings).
    :param stop_words: Iterable of words (strings) to remove.
    :return: List of sentences (strings) without the stop words.
    """
    # A set gives O(1) membership tests instead of O(n) per word on a list.
    stop_set = set(stop_words)
    cleaned_texts = []
    for text in texts:
        # Splitting on a single space mirrors the original tokenization;
        # empty tokens from repeated spaces fail the len > 3 filter anyway.
        kept = [w for w in text.split(" ") if w not in stop_set and len(w) > 3]
        cleaned_texts.append(" ".join(kept))
    return cleaned_texts
def get_cleaned_documents(df_original):
    """
    Given a df with a column ``Texto`` (tweets), preprocess the texts of that
    column by removing URLs, punctuation, and stop words, and lowercasing.

    :param df_original: A DataFrame with the tweets in a ``Texto`` column.
    :return: A list with the cleaned tweets (strings).
    """
    # Work on a copy so the caller's DataFrame is not mutated.
    df = df_original.copy()
    df["Texto"] = df["Texto"].apply(str)
    # Strip URLs. regex=True is required: pandas >= 2.0 defaults to a
    # literal replacement, which would silently stop matching these patterns.
    df["Texto"] = df["Texto"].str.replace(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
        ' ', regex=True)
    # Remove punctuation (anything that is not a word character or whitespace).
    df["Texto"] = df["Texto"].str.replace(r'[^\w\s]', '', regex=True)
    df["Texto"] = df["Texto"].map(lambda x: x.lower())
    # Fuse the bigram so it survives word-level stop-word filtering as one token.
    df["Texto"] = df["Texto"].map(lambda x: re.sub("climate change", "climatechange", x))
    # English + Spanish stop words, plus domain-specific noise terms.
    stop_words = stopwords.words("english")
    stop_words.extend(stopwords.words("spanish"))
    stop_words.extend(
        ["from", "citizenscience", "citizen science", "citizen", "science",
         "ciencia", "need", "thank", "project", "projects"])
    documents = df["Texto"].values.tolist()
    documents = remove_stopwords(documents, stop_words)
    return documents
def create_bert_model(documents):
    """
    Fit a BERTopic model on a list of sentences.

    :param documents: List of sentences (strings).
    :return: Tuple of (fitted BERTopic model, topic assignments, probabilities).
    """
    topic_model = BERTopic(verbose=True, language="multilingual", min_topic_size=100)
    assigned_topics, probabilities = topic_model.fit_transform(documents)
    return topic_model, assigned_topics, probabilities
def get_intertopic_distance(model, top_n_topics=20):
    """
    Wrapper to create the intertopic distance visualization.

    :param model: The BERTopic model.
    :param top_n_topics: The number of topics to show (default 20).
    :return: The intertopic distance visualization.
    """
    # Bug fix: the original hard-coded top_n_topics=20 in the call,
    # silently ignoring the caller-supplied parameter.
    return model.visualize_topics(height=800, top_n_topics=top_n_topics)
def get_hierarchical_clusterin(model):
    """
    Wrapper to create the hierarchical clustering visualization.

    NOTE(review): the missing "g" in the function name is kept because
    renaming would break existing callers.

    :param model: The BERTopic model.
    :return: The hierarchical clustering visualization.
    """
    figure = model.visualize_hierarchy()
    return figure
def get_topics_bar(model, top_n_topics=9):
    """
    Wrapper to create the topic barchart visualization.

    :param model: The BERTopic model.
    :param top_n_topics: The number of topics to show (default 9).
    :return: The barchart visualization.
    """
    figure = model.visualize_barchart(top_n_topics=top_n_topics, height=800)
    return figure
def get_heatmap(model):
    """
    Wrapper to create the topic-similarity heatmap visualization.

    :param model: The BERTopic model.
    :return: The heatmap visualization.
    """
    figure = model.visualize_heatmap()
    return figure
def get_topics_over_time(df, model, documents, topics):
    """
    Wrapper to create the topics-over-time visualization.

    (Doc fix: the original docstring was a copy-paste of the heatmap
    wrapper's and omitted most parameters.)

    :param df: DataFrame with a "Fecha" (date) column used as timestamps.
    :param model: The BERTopic model.
    :param documents: List of documents the model was fitted on.
    :param topics: Topic assignments produced with the documents.
    :return: The topics-over-time visualization.
    """
    timestamps = df["Fecha"]
    return model.visualize_topics_over_time(documents, topics, timestamps)
def load_model(filename):
    """
    Load a previously persisted BERTopic model from disk.

    :param filename: A string that represents the filename.
    :return: The loaded BERTopic model.
    """
    return BERTopic.load(filename)
def load_topics(filename):
    """
    Load the topics generated when creating the BERTopic model.

    :param filename: Path of a JSON file containing a "topics" key.
    :return: The BERTopic topics stored in the file.
    """
    with open(filename, "r") as handle:
        payload = json.load(handle)
    return payload["topics"]