# Source code for berttopic_utils

""" This module provides functionality to create topic modelling models and visualizations.
"""

from nltk.corpus import stopwords
from generate_utils import filter_by_topic
from bertopic import BERTopic
import json
import re


def remove_stopwords(texts, stop_words):
    """
    Given a list of sentences, remove the stop words from each sentence.

    Words of length 3 or less are also dropped, matching the original
    cleaning behavior for short/noise tokens.

    :param texts: List of sentences (strings).
    :param stop_words: Iterable of words (strings) to remove.
    :return: List of sentences (strings) without the stop words.
    """
    # A set gives O(1) membership tests instead of O(n) per word on a list.
    stop_set = set(stop_words)
    cleaned_texts = []
    for text in texts:
        # Splitting on a single space mirrors the original tokenization;
        # empty tokens from repeated spaces fail the len > 3 filter anyway.
        kept = [w for w in text.split(" ") if w not in stop_set and len(w) > 3]
        cleaned_texts.append(" ".join(kept))
    return cleaned_texts
def get_cleaned_documents(df_original):
    """
    Given a df with a column ``Texto`` (tweets), preprocess the texts of that
    column by removing URLs, punctuation, and stop words, and lowercasing.

    :param df_original: A DataFrame with the tweets in a ``Texto`` column.
    :return: A list with the cleaned tweets (strings).
    """
    # Work on a copy so the caller's DataFrame is not mutated.
    df = df_original.copy()
    df["Texto"] = df["Texto"].apply(str)
    # Strip URLs. regex=True is required: pandas >= 2.0 defaults to a
    # literal replacement, which would silently stop matching these patterns.
    df["Texto"] = df["Texto"].str.replace(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
        ' ', regex=True)
    # Remove punctuation (anything that is not a word character or whitespace).
    df["Texto"] = df["Texto"].str.replace(r'[^\w\s]', '', regex=True)
    df["Texto"] = df["Texto"].map(lambda x: x.lower())
    # Fuse the bigram so it survives word-level stop-word filtering as one token.
    df["Texto"] = df["Texto"].map(lambda x: re.sub("climate change", "climatechange", x))
    # English + Spanish stop words, plus domain-specific noise terms.
    stop_words = stopwords.words("english")
    stop_words.extend(stopwords.words("spanish"))
    stop_words.extend(
        ["from", "citizenscience", "citizen science", "citizen", "science",
         "ciencia", "need", "thank", "project", "projects"])
    documents = df["Texto"].values.tolist()
    documents = remove_stopwords(documents, stop_words)
    return documents
def create_bert_model(documents):
    """
    Fit a BERTopic model on a list of sentences.

    :param documents: List of sentences (strings).
    :return: Tuple of (fitted BERTopic model, topic assignments, probabilities).
    """
    topic_model = BERTopic(verbose=True, language="multilingual", min_topic_size=100)
    assigned_topics, probabilities = topic_model.fit_transform(documents)
    return topic_model, assigned_topics, probabilities
def get_intertopic_distance(model, top_n_topics=20):
    """
    Wrapper to create the intertopic distance visualization.

    :param model: The BERTopic model.
    :param top_n_topics: The number of topics to show (default 20).
    :return: The intertopic distance visualization.
    """
    # Bug fix: the original hard-coded top_n_topics=20 in the call,
    # silently ignoring the caller-supplied parameter.
    return model.visualize_topics(height=800, top_n_topics=top_n_topics)
def get_hierarchical_clusterin(model):
    """
    Wrapper to create the hierarchical clustering visualization.

    NOTE(review): the missing "g" in the function name is kept because
    renaming would break existing callers.

    :param model: The BERTopic model.
    :return: The hierarchical clustering visualization.
    """
    figure = model.visualize_hierarchy()
    return figure
def get_topics_bar(model, top_n_topics=9):
    """
    Wrapper to create the topic barchart visualization.

    :param model: The BERTopic model.
    :param top_n_topics: The number of topics to show (default 9).
    :return: The barchart visualization.
    """
    figure = model.visualize_barchart(top_n_topics=top_n_topics, height=800)
    return figure
def get_heatmap(model):
    """
    Wrapper to create the topic-similarity heatmap visualization.

    :param model: The BERTopic model.
    :return: The heatmap visualization.
    """
    figure = model.visualize_heatmap()
    return figure
def get_topics_over_time(df, model, documents, topics):
    """
    Wrapper to create the topics-over-time visualization.

    (Doc fix: the original docstring was a copy-paste of the heatmap
    wrapper's and omitted most parameters.)

    :param df: DataFrame with a "Fecha" (date) column used as timestamps.
    :param model: The BERTopic model.
    :param documents: List of documents the model was fitted on.
    :param topics: Topic assignments produced with the documents.
    :return: The topics-over-time visualization.
    """
    timestamps = df["Fecha"]
    return model.visualize_topics_over_time(documents, topics, timestamps)
def load_model(filename):
    """
    Load a previously persisted BERTopic model from disk.

    :param filename: A string that represents the filename.
    :return: The loaded BERTopic model.
    """
    return BERTopic.load(filename)
def load_topics(filename):
    """
    Load the topics generated when creating the BERTopic model.

    :param filename: Path of a JSON file containing a "topics" key.
    :return: The BERTopic topics stored in the file.
    """
    with open(filename, "r") as handle:
        payload = json.load(handle)
    return payload["topics"]