""" This module provides functionality to create topic modelling models and visualizations.
"""
from nltk.corpus import stopwords
from generate_utils import filter_by_topic
from bertopic import BERTopic
import json
import re
def remove_stopwords(texts, stop_words):
    """
    Remove stop words (and very short words) from each sentence.

    Sentences are split on single spaces; a word is kept only when it is
    not in ``stop_words`` AND is longer than 3 characters.

    :param texts: List of sentences (strings).
    :param stop_words: Iterable of words (strings) to remove.
    :return: List of sentences (strings) without the stop words.
    """
    # A set gives O(1) membership tests; the combined English+Spanish
    # stop-word list runs to hundreds of entries, so a per-word list
    # scan is needlessly quadratic.
    stop_set = set(stop_words)
    return [
        " ".join(w for w in text.split(" ") if w not in stop_set and len(w) > 3)
        for text in texts
    ]
def get_cleaned_documents(df_original):
    """
    Preprocess the tweets in the "Texto" column of a dataframe.

    Removes URLs and punctuation, lowercases the text, merges the bigram
    "climate change" into the single token "climatechange", and strips
    English/Spanish stop words plus a set of domain-specific common words.

    :param df_original: A dataframe with a "Texto" column holding the
        tweets. It is copied, not modified in place.
    :return: A list with the cleaned tweets (strings).
    """
    df = df_original.copy()
    df["Texto"] = df["Texto"].apply(str)
    # Strip URLs. regex=True is required: pandas >= 2.0 treats str.replace
    # patterns literally by default, which would silently leave URLs intact.
    df["Texto"] = df["Texto"].str.replace(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ' ',
        regex=True)
    # Drop punctuation: anything that is not a word character or whitespace.
    df["Texto"] = df["Texto"].str.replace(r'[^\w\s]', '', regex=True)
    df["Texto"] = df["Texto"].map(lambda x: x.lower())
    # Keep the key bigram as one token so it survives word-level filtering.
    df["Texto"] = df["Texto"].map(lambda x: re.sub("climate change", "climatechange", x))
    stop_words = stopwords.words("english")
    stop_words.extend(stopwords.words("spanish"))
    # Domain-specific words that dominate every tweet and carry no topic signal.
    stop_words.extend(
        ["from", "citizenscience", "citizen science", "citizen", "science", "ciencia", "need", "thank", "project",
         "projects"])
    documents = df["Texto"].values.tolist()
    documents = remove_stopwords(documents, stop_words)
    return documents
def create_bert_model(documents):
    """
    Fit a BERTopic model on a list of sentences.

    :param documents: List of sentences (strings).
    :return: Tuple of (fitted BERTopic model, topics, probabilities).
    """
    topic_model = BERTopic(verbose=True, language="multilingual", min_topic_size=100)
    topics, probabilities = topic_model.fit_transform(documents)
    return topic_model, topics, probabilities
def get_intertopic_distance(model, top_n_topics=20):
    """
    Wrapper to create the intertopic distance visualization.

    :param model: The BERTopic model.
    :param top_n_topics: The number of topics to show. Previously this
        argument was accepted but ignored (the call hard-coded 20); it is
        now forwarded to the model.
    :return: The intertopic distance visualization.
    """
    return model.visualize_topics(height=800, top_n_topics=top_n_topics)
def get_hierarchical_clusterin(model):
    """
    Wrapper to create the hierarchical clustering visualization.

    :param model: The BERTopic model.
    :return: The hierarchical clustering visualization.
    """
    figure = model.visualize_hierarchy()
    return figure
def get_topics_bar(model, top_n_topics=9):
    """
    Wrapper to create the barchart visualization.

    :param model: The BERTopic model.
    :param top_n_topics: The number of topics to show.
    :return: The barchart visualization.
    """
    chart = model.visualize_barchart(top_n_topics=top_n_topics, height=800)
    return chart
def get_heatmap(model):
    """
    Wrapper to create the heatmap visualization.

    :param model: The BERTopic model.
    :return: The heatmap visualization.
    """
    heatmap = model.visualize_heatmap()
    return heatmap
def get_topics_over_time(df, model, documents, topics):
    """
    Wrapper to create the topics-over-time visualization.

    :param df: Dataframe with a "Fecha" column holding the timestamps.
    :param model: The BERTopic model.
    :param documents: List of sentences (strings) the model was fitted on.
    :param topics: The topics produced when fitting the model.
    :return: The topics-over-time visualization.
    """
    timestamps = df["Fecha"]
    return model.visualize_topics_over_time(documents, topics, timestamps)
def load_model(filename):
    """
    Load a previously saved BERTopic model from disk.

    :param filename: A string that represents the filename.
    :return: The BERTopic model.
    """
    return BERTopic.load(filename)
def load_topics(filename):
    """
    Load the topics generated when creating the BERTopic model.

    :param filename: A string that represents the filename of a JSON file
        with a top-level "topics" key.
    :return: The BERTopic topics.
    """
    with open(filename, "r") as topics_file:
        payload = json.load(topics_file)
    return payload["topics"]