import pandas as pd
import re
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import networkx as nx
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from matplotlib.pyplot import subplots
from os import path
from PIL import Image
from collections import Counter
import string
import nltk
from nltk.corpus import stopwords
import matplotlib.dates as mdates
import time
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# Generic citizen-science terms used as stop words by the filtering helpers below.
stop_words = ['#citizenscience', 'citizenscience', 'rt', 'citizen', 'science', 'citsci', 'cienciaciudadana']
# Function to create a bargraph:
def plotbarchart(numberbars, x, y, title=None, xlabel=None, ylabel=None):
    """
    Draw a bar chart of the first ``numberbars`` elements of x and y.

    :param numberbars: Number of elements to plot in the chart
    :param x: Elements for x axis
    :param y: Elements for y axis, number of appearances of the x elements
    :param title: Title for the figure, defaults to None
    :param xlabel: Label for the x axis, defaults to None
    :param ylabel: Label for the y axis, defaults to None
    """
    sns.set()
    fig, ax = subplots()
    ax.bar(x[:numberbars], y[:numberbars], color="lightsteelblue")
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    plt.xlabel(xlabel, fontsize=15)
    plt.ylabel(ylabel, fontsize=15)
    plt.xticks(rotation=45)
    plt.title(title, fontsize=20, fontweight='bold')
    # Bug fix: the original referenced plt.tight_layout without calling it,
    # so the layout adjustment never actually ran.
    plt.tight_layout()
def scatterplot(x, y, xlabel="Indegree", ylabel="Outdegree"):
    """
    Draw a scatter plot of y against x and show it.

    :param x: Elements for the x axis
    :param y: Elements for the y axis, number of appearances of the x elements
    :param xlabel: Label for the x axis, defaults to "Indegree" (the original
        hard-coded label, kept as default for backward compatibility)
    :param ylabel: Label for the y axis, defaults to "Outdegree"
    """
    plt.figure(figsize=(10, 8))
    plt.scatter(x=x, y=y, c="lightsteelblue")
    plt.xlabel(xlabel, fontsize=15)
    plt.ylabel(ylabel, fontsize=15)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
# Function to obtain the subgraphs (connected components) with NetworkX:
def get_subgraphs(graph):
    """
    Split an undirected networkx graph into its connected components.

    :param graph: Networkx undirected graph
    :return: list of subgraphs as networkx objects
    """
    return [graph.subgraph(nodes) for nodes in nx.connected_components(graph)]
# Function to convert the subgraphs to directed graphs:
'''def make_weightedDiGraph(ejes):
edges_tupla = [tuple(x) for x in ejes]
G = nx.DiGraph((x, y, {'weight': v}) for (x, y), v in Counter(edges_tupla).items())
return G'''
# Converts subgraphs to direct graphs:
def direct_subgraphs(subgraphs):
    """
    Convert every undirected subgraph in a list to its directed counterpart.

    :param subgraphs: List of undirected networkx subgraphs
    :return: List of directed subgraphs as networkx objects
    """
    return [subgraph.to_directed() for subgraph in subgraphs]
# Function to filter the DataFrame by the topic of interest:
def filter_by_topic(df, keywords, stopwords):
    """
    Filter a tweets DataFrame by keywords and stopwords found in the text.

    :param df: Dataframe with all the tweets
    :param keywords: List of words acting as key to filter the dataframe
    :param stopwords: List of words destined to filter out the tweets containing them
    :return: DataFrame with the tweets containing the keywords
    """
    if keywords:
        # Bug fix: Series.any(level=0) was deprecated and removed in pandas 2.0;
        # the boolean mask returned by str.contains already selects rows directly.
        # na=False: rows with missing text cannot match a keyword.
        df = df[df['Texto'].str.contains("|".join(keywords), case=False, na=False)]
    if stopwords:
        df = df[~df['Texto'].str.contains("|".join(stopwords), case=False, na=False)]
    return df
# Function to filter by subtopic:
def filter_by_subtopic(df, keywords2, stopwords2):
    """
    Filter an already topic-filtered DataFrame by a subtopic of interest.

    :param df: DataFrame with the tweets
    :param keywords2: List of words acting as key to filter the dataframe
    :param stopwords2: List of words destined to filter out the tweets that contain them
    :return: DataFrame with the tweets containing the keywords
    """
    if keywords2:
        # Bug fix: Series.any(level=0) was deprecated and removed in pandas 2.0;
        # use the str.contains boolean mask directly (na=False drops missing text).
        df = df[df['Texto'].str.contains("|".join(keywords2), case=False, na=False)]
    if stopwords2:
        df = df[~df['Texto'].str.contains("|".join(stopwords2), case=False, na=False)]
    return df
# Function to filter by interest:
def filter_by_interest(df, interest):
    """
    Filter the DataFrame by the 'Marca' (interest) column.

    :param df: DataFrame with all the tweets
    :param interest: Active interest from the different categories available from the
        Lynguo tool. May be a single string, a list of interests, or None (no filtering)
    :return: DataFrame containing the tweets filtered by the selected interest
    """
    # Bug fix: the original used "interest is str" / "interest is list", which
    # compares the value to the type object itself and is always False, so the
    # function never filtered anything. isinstance() is the correct check.
    if isinstance(interest, str):
        df = df[df['Marca'] == interest]
    elif isinstance(interest, list):
        df = df[df['Marca'].isin(interest)]
    return df
# Calculate Mentions network graph:
def get_cites(df, keywords=None, stopwords=None, keywords2=None, stopwords2=None, interest=None):
    """
    Return [user, text] pairs for the ordinary (non-retweet) tweets after
    applying the interest/topic/subtopic filters.

    :param df: DataFrame with all the tweets
    :param keywords: List of words acting as key to filter the DataFrame
    :param stopwords: List of words destined to filter out the tweets that contain them
    :param keywords2: List of words acting as key to filter the DataFrame according to a subtopic
    :param stopwords2: List of words destined to filter out the tweets that contain them according to a subtopic
    :param interest: Active interest from the different categories available from the Lynguo tool
    :return: Nested lists containing normal tweets (not retweets) and the user who wrote each one
    """
    filtered = filter_by_interest(df, interest)
    filtered = filter_by_topic(filtered, keywords, stopwords)
    filtered = filter_by_subtopic(filtered, keywords2, stopwords2)
    mentions = filtered[['Usuario', 'Texto']].copy().dropna()
    # Drop retweets: rows whose text starts with "RT @".
    mentions = mentions[~mentions['Texto'].str.match('RT @')]
    return [list(pair) for pair in mentions[['Usuario', 'Texto']].to_numpy()]
# Calculate RT network graph:
# Function to extract as list all the types of tweets, use specially for hashtag analysis (with get_edgesMain):
def get_all(df, keywords=None, stopwords=None, keywords2=None, stopwords2=None, interest=None):
    """
    Return the text of every tweet (originals and retweets alike) after the
    interest/topic/subtopic filters have been applied.

    :param df: DataFrame with all the tweets
    :param keywords: List of words acting as key to filter the DataFrame
    :param stopwords: List of words destined to filter out the tweets that contain them
    :param keywords2: List of words acting as key to filter the DataFrame according to a subtopic
    :param stopwords2: List of words destined to filter out the tweets that contain them according to a subtopic
    :param interest: Active interest from the different categories available from the Lynguo tool
    :return: Numpy array with the text of every tweet
    """
    filtered = filter_by_interest(df, interest)
    filtered = filter_by_topic(filtered, keywords, stopwords)
    filtered = filter_by_subtopic(filtered, keywords2, stopwords2)
    # dropna on both columns so tweets with a missing user are excluded too.
    tweets = filtered[['Usuario', 'Texto']].copy().dropna()
    return tweets['Texto'].to_numpy()
# Function to extract edges from RTs, mentions and all:
def get_edges(values):
    """
    Build network edges from [user, text] pairs: the text of each pair is
    replaced in place by the first @-mention found in it.

    :param values: List of lists with the tweet and user
    :return: List of lists containing the user and the @ inside the tweet
    """
    mention = re.compile(r'@(\w+)')
    edges = []
    for pair in values:
        hit = mention.search(pair[1])
        if hit is None:
            continue
        # The first mention is the account being retweeted/cited.
        pair[1] = hit.group(1)
        edges.append(pair)
    return edges
# Code to create a bar graph of most used hashtags in RTs:
# Selection of rows only with RTs and creation of a list containing the texts:
# Obtain hashtags used in Text:
def get_edgesHashRT(values):
    """
    Collect every hashtag appearing in a list of retweet texts.

    :param values: list with the retweets
    :return: list with all the hashtags in these retweets
    """
    hashtags = []
    for text in values:
        hashtags.extend(re.findall(r'#(\w+)', text))
    return hashtags
# Organisation of hashtags by usage and creation of a list containing number of appearances:
# Code to visualise the graph of hashtags in RTs:
def get_edgesHashRT2(values):
    """
    Build user-hashtag edges from [user, retweet] pairs: the retweet text is
    replaced in place by the first hashtag found in it.

    :param values: List of lists with user and retweet
    :return: List of lists, where each list contains user and hashtag
    """
    edges = []
    for pair in values:
        found = re.search(r'#(\w+)', pair[1])
        if found is None:
            continue
        pair[1] = found.group(1)
        edges.append(pair)
    return edges
# Combination of edges of RTs and Mentions:
def combined_edges(x, y):
    """
    Concatenate the retweet edges and the mention edges into one list.

    :param x: Edges from retweets
    :param y: Edges from mentions
    :return: List of lists with the edges combined
    """
    return [*x, *y]
## Code to show the graph of related hashtags out of RTs. Shows hashtags related to each other
def get_hashtagsmain(df, keywords=None, stopwords=None, keywords2=None, stopwords2=None, interest=None):
    """
    Return the text of every original (non-retweet) tweet after the
    interest/topic/subtopic filters have been applied.

    :param df: DataFrame with all the tweets
    :param keywords: List of words acting as key to filter the DataFrame
    :param stopwords: List of words destined to filter out the tweets that contain them
    :param keywords2: List of words acting as key to filter the DataFrame according to a subtopic
    :param stopwords2: List of words destined to filter out the tweets that contain them according to a subtopic
    :param interest: Active interest from the different categories available from the Lynguo tool
    :return: Numpy array with the text of the non-retweet tweets
    """
    filtered = filter_by_interest(df, interest)
    filtered = filter_by_topic(filtered, keywords, stopwords)
    filtered = filter_by_subtopic(filtered, keywords2, stopwords2)
    tweets = filtered[['Usuario', 'Texto']].copy().dropna()
    # Discard retweets (text starting with "RT @").
    tweets = tweets[~tweets['Texto'].str.match('RT @')]
    return tweets['Texto'].to_numpy()
"""def mainHashtags(values):
stop_words = ['#citizenscience', 'citizenscience', 'rt', 'citizen', 'science', 'citsci', 'cienciaciudadana']
mainHashtags = []
aristasHashtags = []
for row in values:
match = re.findall('#(\w+)', row.lower())
length = len(match)
try:
match = [word for word in match if word not in stop_words]
except ValueError:
pass
for index,hashtag in enumerate(match):
mainHashtags.append(hashtag)
if index | (length-2):
nextHashtags = match[index+1:length-1]
for nextHashtags in nextHashtags:
aristasHashtags.append([hashtag,nextHashtags])
return aristasHashtags"""
# Creation of the graph of most used hashtags (related to user):
def get_hashtagsmain2(df, keywords=None, stopwords=None, keywords2=None, stopwords2=None, interest=None):
    """
    Return [user, text] pairs for the original (non-retweet) tweets after the
    interest/topic/subtopic filters have been applied.

    :param df: DataFrame with all the tweets
    :param keywords: List of words acting as key to filter the DataFrame
    :param stopwords: List of words destined to filter out the tweets that contain them
    :param keywords2: List of words acting as key to filter the DataFrame according to a subtopic
    :param stopwords2: List of words destined to filter out the tweets that contain them according to a subtopic
    :param interest: Active interest from the different categories available from the Lynguo tool
    :return: List of lists, each list contains user, written tweet
    """
    filtered = filter_by_interest(df, interest)
    filtered = filter_by_topic(filtered, keywords, stopwords)
    filtered = filter_by_subtopic(filtered, keywords2, stopwords2)
    tweets = filtered[['Usuario', 'Texto']].copy().dropna()
    # Discard retweets (text starting with "RT @").
    tweets = tweets[~tweets['Texto'].str.match('RT @')]
    return [list(pair) for pair in tweets[['Usuario', 'Texto']].to_numpy()]
def get_edgesmain2(values):
    """
    Build user-hashtag edges from [user, tweet] pairs: the tweet text is
    replaced in place by its first hashtag, and edges whose hashtag is a
    generic citizen-science term are dropped.

    :param values: List of lists containing the edges (user, tweet)
    :return: List of lists, in each list the user and the hashtag used by them is stored
    """
    # Generic terms to exclude, compared case-insensitively.
    stop_words = {'citizenscience', 'rt', 'citizen', 'science', 'citsci', 'cienciaciudadana'}
    edges = []
    for row in values:
        match = re.search(r'#(\w+)', row[1])
        if match:
            row[1] = match.group(1)
            edges.append(row)
    # Bug fix: the original defined stop_words as a nested list and compared
    # each hashtag string against that list with "!=", which is always True,
    # so the filter never removed anything. Compare the lowercased hashtag
    # against the set of stop words instead.
    edges = [edge for edge in edges if edge[1].lower() not in stop_words]
    return edges
# Creation of the graph of most used hashtags out of RTs(use get_hashtagsmain()):
def get_edgesMain(values):
    """
    Collect every hashtag (lower-cased) appearing in a list of tweets.

    :param values: List of tweets
    :return: List with the hashtags inside the tweets
    """
    pattern = re.compile(r'#(\w+)')
    collected = []
    for text in values:
        collected += pattern.findall(text.lower())
    return collected
# Hashtags from the Bot:
# NOTE(review): these look like hashtags emitted by an automated air-quality
# account (luftdaten/fijnstof), presumably used as stop words elsewhere — confirm.
botwords=['airpollution', 'luftdaten', 'fijnstof', 'waalre', 'pm2', 'pm10']
def prepare_hashtagsmain(list, stopwords=None):
    """
    Count how often each hashtag appears, excluding generic citizen-science terms.

    :param list: List of hashtags (note: this parameter shadows the builtin
        ``list``; the name is kept unchanged for backward compatibility)
    :param stopwords: Optional list of lowercase words to filter out additionally
    :return: Tuple (counts, hashtags): appearance counts in descending order
        and the matching unique hashtags
    """
    # Generic citizen-science terms that are always excluded.
    citsci_words = ['#citizenscience', 'citizenscience', 'rt', 'citizen', 'science', 'citsci', 'cienciaciudadana']
    lista = [x.lower() for x in list]
    lista = [word for word in lista if word not in citsci_words]
    # Idiom fix: compare against None with "is not", not "!=".
    if stopwords is not None:
        lista = [word for word in lista if word not in stopwords]
    unique, counts = np.unique(lista, return_counts=True)
    # Sort (count, hashtag) pairs so the most frequent hashtags come first.
    ranked = sorted(zip(counts, unique), reverse=True)
    sortedNumberHashtags, sortedMainHashtags = zip(*ranked)
    return sortedNumberHashtags, sortedMainHashtags
"""def get_prop_type(value, key=None):
'''
Performs typing and value conversion for the graph_tool PropertyMap class.
If a key is provided, it also ensures the key is in a format that can be
used with the PropertyMap. Returns a tuple, (type name, value, key)
'''
# Deal with the value
if isinstance(value, bool):
tname = 'bool'
elif isinstance(value, int):
tname = 'float'
value = float(value)
elif isinstance(value, float):
tname = 'float'
elif isinstance(value, dict):
tname = 'object'
else:
tname = 'string'
value = str(value)
return tname, value, key
def nx2gt(nxG):
'''
Converts a networkx graph to a graph-tool graph.
'''
# Phase 0: Create a directed or undirected graph-tool Graph
gtG = gt.Graph(directed=nxG.is_directed())
# Add the Graph properties as "internal properties"
for key, value in nxG.graph.items():
# Convert the value and key into a type for graph-tool
tname, value, key = get_prop_type(value, key)
prop = gtG.new_graph_property(tname) # Create the PropertyMap
gtG.graph_properties[key] = prop # Set the PropertyMap
gtG.graph_properties[key] = value # Set the actual value
# Phase 1: Add the vertex and edge property maps
# Go through all nodes and edges and add seen properties
# Add the node properties first
nprops = set() # cache keys to only add properties once
for node, data in nxG.nodes(data=True):
# Go through all the properties if not seen and add them.
for key, val in data.items():
if key in nprops:
continue # Skip properties already added
# Convert the value and key into a type for graph-tool
tname, _, key = get_prop_type(val, key)
prop = gtG.new_vertex_property(tname) # Create the PropertyMap
gtG.vertex_properties[key] = prop # Set the PropertyMap
# Add the key to the already seen properties
nprops.add(key)
# Also add the node id: in NetworkX a node can be any hashable type, but
# in graph-tool node are defined as indices. So we capture any strings
# in a special PropertyMap called 'id' -- modify as needed!
gtG.vertex_properties['id'] = gtG.new_vertex_property('string')
# Add the edge properties second
eprops = set() # cache keys to only add properties once
for src, dst, data in nxG.edges(data=True):
# Go through all the edge properties if not seen and add them.
for key, val in data.items():
if key in eprops:
continue # Skip properties already added
# Convert the value and key into a type for graph-tool
tname, _, key = get_prop_type(val, key)
prop = gtG.new_edge_property(tname) # Create the PropertyMap
gtG.edge_properties[key] = prop # Set the PropertyMap
# Add the key to the already seen properties
eprops.add(key)
# Phase 2: Actually add all the nodes and vertices with their properties
# Add the nodes
vertices = {} # vertex mapping for tracking edges later
for node, data in nxG.nodes(data=True):
# Create the vertex and annotate for our edges later
v = gtG.add_vertex()
vertices[node] = v
# Set the vertex properties, not forgetting the id property
data['id'] = str(node)
for key, value in data.items():
gtG.vp[key][v] = value # vp is short for vertex_properties
# Add the edges
for src, dst, data in nxG.edges(data=True):
# Look up the vertex structs from our vertices mapping and add edge.
e = gtG.add_edge(vertices[src], vertices[dst])
# Add the edge properties
for key, value in data.items():
gtG.ep[key][e] = value # ep is short for edge_properties
# Done, finally!
return gtG"""
# Function to extract the degree, outdegree, eigenvector and betweenness values and build a DataFrame:
def get_degrees(G):
    """
    Given a Networkx directed graph, the function returns a DataFrame containing the centrality
    measures of the graph (Indegree, Outdegree, Betweenness and Eigenvector), one row per node
    :param G: Networkx directed graph
    :return: Dataframe with the node names, in/out degrees and centrality measures
    """
    # networkit is imported lazily so the rest of the module works without it installed.
    import networkit as nk
    from operator import itemgetter
    # Convert the networkx graph to a networkit graph (nodes become integer ids).
    n_g = nk.nxadapter.nx2nk(G)
    # Map networkit integer id -> original node name.
    # NOTE(review): this assumes nx2nk assigns ids 0..n-1 in the same order as
    # G.nodes() iteration — confirm against the networkit documentation.
    idmap = dict((u, id) for (id, u) in zip(G.nodes(), range(G.number_of_nodes())))
    btwn = nk.centrality.Betweenness(n_g)
    ec = nk.centrality.EigenvectorCentrality(n_g)
    ec.run()
    btwn.run()
    # ranking() yields (node_id, score) pairs; sorting by node id aligns the
    # score lists with the iterNodes() order used to collect names below.
    bt_results = sorted(btwn.ranking(), key=itemgetter(0))
    bt_results = [round(value,4) for id_, value in bt_results]
    ec_results = sorted(ec.ranking(), key=itemgetter(0))
    ec_results = [round(value,4) for id_, value in ec_results]
    names = []
    in_degrees = []
    out_degrees = []
    nodes = n_g.iterNodes()
    for key in nodes:
        names.append(idmap[key])
        in_degrees.append(n_g.degreeIn(key))
        out_degrees.append(n_g.degreeOut(key))
    return pd.DataFrame({"Name": names, "InD.": in_degrees, "OutD.": out_degrees, "Eigen C.": ec_results, "Betweenness C": bt_results})
"""def csv_degval(Digraph, filename):
list_values = []
outdegrees2 = dict(Digraph.out_degree())
indegrees = dict(Digraph.in_degree())
centrality = dict(nx.eigenvector_centrality(Digraph))
betweenness = dict(nx.betweenness_centrality(Digraph))
indegtupl = sorted([(k, v) for k, v in indegrees.items()], key=lambda x:x[1], reverse=True)
indegtupl = indegtupl[0:10]
names = [i[0] for i in indegtupl]
outdegtupl = sorted([(k,v) for k,v in outdegrees2.items()], key=lambda x:x[1], reverse=True)
centraltupl = sorted([(k,v) for k,v in centrality.items()], key=lambda x:x[1], reverse=True)
betwentupl = sorted([(k,v) for k,v in betweenness.items()], key=lambda x:x[1], reverse=True)
for name in names:
pos_indeg = [y[0] for y in indegtupl].index(name)
rank_indeg = pos_indeg + 1
indeg_val = indegtupl[pos_indeg][1]
pos_outdeg = [y[0] for y in outdegtupl].index(name)
rank_outdeg = pos_outdeg + 1
outdeg_val = outdegtupl[pos_outdeg][1]
pos_central = [y[0] for y in centraltupl].index(name)
rank_central = pos_central + 1
central_val = centraltupl[pos_central][1]
central_val = round(centraltupl[pos_central][1],6)
pos_between = [y[0] for y in betwentupl].index(name)
rank_between = pos_between + 1
between_val = round(betwentupl[pos_between][1], 6)
list_values.append((name, indeg_val, rank_indeg, outdeg_val, rank_outdeg, central_val, rank_central,
between_val, rank_between))
df = pd.DataFrame(list_values,
columns=['Name', 'Indegree', 'R.In', 'Outdegree', 'R.Out', 'Eigenvector', 'R.EI', 'Betweenness',
'R.Bet'])
return df"""
## Functions to obtain elements for two mode:
# RTs:
def get_twomodeRT(df, keywords=None, stopwords=None, keywords2=None, stopwords2=None, interest=None):
    """
    Build a bipartite (two-mode) graph of users and retweet texts. Repeated
    user-retweet pairs become edge weights.

    :param df: DataFrame with all the tweets
    :param keywords: List of words acting as key to filter the DataFrame
    :param stopwords: List of words destined to filter out the tweets that contain them
    :param keywords2: List of words acting as key to filter the DataFrame according to a subtopic
    :param stopwords2: List of words destined to filter out the tweets that contain them according to a subtopic
    :param interest: Active interest from the different categories available from the Lynguo tool
    :return: Networkx bipartite graph
    """
    filtered = filter_by_interest(df, interest)
    filtered = filter_by_topic(filtered, keywords, stopwords)
    filtered = filter_by_subtopic(filtered, keywords2, stopwords2)
    retweets = filtered[['Usuario', 'Texto']].copy()
    # Keep only retweets (text containing "RT @"; missing text excluded).
    retweets = retweets[retweets['Texto'].str.contains('RT @', na=False)]
    pairs = retweets[['Usuario', 'Texto']].drop_duplicates()
    users = list(pairs['Usuario'])
    texts = list(pairs['Texto'])
    weighted = Counter(tuple(pair) for pair in pairs.to_numpy())
    G = nx.Graph()
    G.add_nodes_from(set(users), bipartite=0)
    G.add_nodes_from(set(texts), bipartite=1)
    G.add_edges_from((a, b, {'weight': w}) for (a, b), w in weighted.items())
    print(len(G.nodes))
    # Trim to a denser core on large graphs; k=1 only removes isolated nodes.
    k = 2 if len(G.nodes) >= 2000 else 1
    G = nx.k_core(G, k=k)
    counter = Counter(nx.core_number(G).values())
    print(counter)
    # Two-column layout positions; computed but never used or returned
    # (kept for parity with the sibling two-mode builders).
    pos = {}
    pos.update((node, (1, index)) for index, node in enumerate(set(users)))
    pos.update((node, (2, index)) for index, node in enumerate(set(texts)))
    return G
# Obtaining components for two mode for hashtags outside RTs:
def get_twomodeHashMain(df, keywords=None, stopwords=None, keywords2=None, stopwords2=None, interest=None, filter_hashtags=None):
    """
    Given a DataFrame with all the tweets, the function returns a networkx bipartite graph with
    the users and hashtags (outside retweets) as nodes
    :param df: DataFrame with all the tweets
    :param keywords: List of words acting as key to filter the DataFrame
    :param stopwords: List of words destined to filter out the tweets that contain them
    :param keywords2: List of words acting as key to filter the DataFrame according to a subtopic
    :param stopwords2: List of words destined to filter out the tweets that contain them according to a subtopic
    :param interest: Active interest from the different categories available from the Lynguo tool
    :param filter_hashtags: Boolean, to remove the predefined citizen science most common hashtags
    :return: Networkx bipartite graph
    """
    edges = []
    df = filter_by_interest(df, interest)
    df = filter_by_topic(df, keywords, stopwords)
    df = filter_by_subtopic(df, keywords2, stopwords2)
    dfMain = df[['Usuario', 'Texto']].copy()
    dfMain = dfMain.dropna()
    # Drop retweets: keep only original tweets (text not starting with "RT @").
    dfEliminarRTs = dfMain[dfMain['Texto'].str.match('RT @')]
    dfMain = dfMain.drop(dfEliminarRTs.index)
    subset = dfMain[['Usuario', 'Texto']]
    subset = subset.drop_duplicates()
    listHT = [list(x) for x in subset.to_numpy()]
    # Generic citizen-science hashtags removed when filter_hashtags is True.
    # NOTE(review): matching below lowercases each word first, so the
    # mixed-case/trailing-space entries here are redundant — 'citizenscience'
    # already covers them.
    stop_words = ['CitizenScience ','citizenscience', 'rt', 'citizen', 'science', 'citsci', 'cienciaciudadana','CitizenScience']
    filter_edges = []
    u = []
    v = []
    # Replace each tweet text with its first hashtag; tweets without any
    # hashtag contribute no edge.
    for row in listHT:
        match = re.search('#(\w+)', row[1])
        if match:
            matchhash = match.group(1)
            row[1] = matchhash
            edges.append(row)
    # Optionally drop edges where either the user or the hashtag is a stop word.
    if filter_hashtags == True:
        for edge in edges:
            stop = False
            for word in edge:
                # print(word, word.lower() in stop_words)
                if word.lower() in stop_words:
                    stop = True
            if not stop:
                filter_edges.append(edge)
    if filter_hashtags == True:
        u = [x[0] for x in filter_edges]
        v = [x[1] for x in filter_edges]
        edges_tuple = [tuple(x) for x in filter_edges]
    else:
        u = [x[0] for x in edges]
        v = [x[1] for x in edges]
        edges_tuple = [tuple(x) for x in edges]
    # Bipartite graph: users on one side (bipartite=0), hashtags on the other
    # (bipartite=1); repeated user-hashtag pairs become edge weights.
    G = nx.Graph()
    G.add_nodes_from(set(u), bipartite=0)
    G.add_nodes_from(set(v), bipartite=1)
    G.add_edges_from((x, y, {'weight': v}) for (x, y), v in Counter(edges_tuple).items())
    print(len(G.nodes))
    G.remove_edges_from(nx.selfloop_edges(G))
    # Keep only the k-core: k=2 on large graphs, otherwise k=1 (drops isolates).
    if len(G.nodes) >= 2000:
        G = nx.k_core(G, k=2)
    else:
        G = nx.k_core(G, k=1)
    counter = Counter(list((nx.core_number(G).values())))
    print(counter)
    # Two-column layout positions; computed but never used or returned.
    pos = {}
    pos.update((node, (1, index)) for index, node in enumerate(set(u)))
    pos.update((node, (2, index)) for index, node in enumerate(set(v)))
    return G
# Function to obtain the components for the two mode for hashtags in RTs:
def get_twomodeHashRT(df, keywords=None, stopwords=None, keywords2=None, stopwords2=None,
                      interest=None, filter_hashtags=None):
    """
    Given a DataFrame with all the tweets, the function returns a networkx bipartite graph with the
    users and hashtags (inside retweets) as nodes
    :param df: DataFrame with all the tweets
    :param keywords: List of words acting as key to filter the DataFrame
    :param stopwords: List of words destined to filter out the tweets that contain them
    :param keywords2: List of words acting as key to filter the DataFrame according to a subtopic
    :param stopwords2: List of words destined to filter out the tweets that contain them according to a subtopic
    :param interest: Active interest from the different categories available from the Lynguo tool
    :param filter_hashtags: Boolean, to remove the predefined citizen science most common hashtags
    :return: Networkx bipartite graph
    """
    df = filter_by_interest(df, interest)
    df = filter_by_topic(df, keywords, stopwords)
    df = filter_by_subtopic(df, keywords2, stopwords2)
    df = df[['Usuario', 'Texto']].copy()
    df = df.dropna()
    idx = df['Texto'].str.contains('RT @', na=False)
    dfRT = df[idx]  # Keep only the rows that are retweets
    subset = dfRT[['Usuario', 'Texto']]
    listHT = [list(x) for x in subset.to_numpy()]
    edges = []
    # Generic citizen-science hashtags removed when filter_hashtags is True.
    # NOTE(review): matching below lowercases each word first, so the
    # mixed-case and '#'-prefixed entries here are redundant/never matched.
    stop_words = ['CitizenScience', 'citizenScience','citizenscience', 'rt', 'citizen', 'science',
                  'citsci', 'cienciaciudadana', '#CitizenScience']
    filter_edges = []
    u = []
    v = []
    # Replace each retweet text with its first hashtag; retweets without any
    # hashtag contribute no edge.
    for row in listHT:
        match = re.search('#(\w+)', row[1])
        if match:
            matchhash = match.group(1)
            row[1] = matchhash
            edges.append(row)
    # Optionally drop edges where either the user or the hashtag is a stop word.
    if filter_hashtags == True:
        for edge in edges:
            stop = False
            for word in edge:
                #print(word, word.lower() in stop_words)
                if word.lower() in stop_words:
                    stop = True
            if not stop:
                filter_edges.append(edge)
    if filter_hashtags == True:
        u = [x[0] for x in filter_edges]
        v = [x[1] for x in filter_edges]
        edges_tuple = [tuple(x) for x in filter_edges]
    else:
        u = [x[0] for x in edges]
        v = [x[1] for x in edges]
        edges_tuple = [tuple(x) for x in edges]
    # Bipartite graph: users on one side (bipartite=0), hashtags on the other
    # (bipartite=1); repeated user-hashtag pairs become edge weights.
    G = nx.Graph()
    G.add_nodes_from(set(u), bipartite=0)
    G.add_nodes_from(set(v), bipartite=1)
    G.add_edges_from((x, y, {'weight': v}) for (x, y), v in Counter(edges_tuple).items())
    print(len(G.nodes))
    G.remove_edges_from(nx.selfloop_edges(G))
    # Keep only the k-core: k=2 on large graphs, otherwise k=1 (drops isolates).
    if len(G.nodes) >= 2000:
        G = nx.k_core(G, k=2)
    else:
        G = nx.k_core(G, k=1)
    counter = Counter(list((nx.core_number(G).values())))
    print(counter)
    # Two-column layout positions; computed but never used or returned.
    pos = {}
    pos.update((node, (1, index)) for index, node in enumerate(set(u)))
    pos.update((node, (2, index)) for index, node in enumerate(set(v)))
    return G
# Wordcloud function for main hashtags:
def wordcloudmain(df, keywords=None, stopwords=None, interest=None):
    """
    Given a DataFrame containing all the tweets, the function builds a wordcloud with the hashtags
    (outside retweets) displayed by frequency and saves it to "wc.png"

    :param df: DataFrame with all the tweets
    :param keywords: List of words acting as key to filter the DataFrame
    :param stopwords: List of words destined to filter out the tweets that contain them
    :param interest: Active interest from the different categories available from the Lynguo tool
    :return: None; the wordcloud figure is written to "wc.png"
    """
    hashtags = []
    stop_words = ['citizenscience', 'rt', 'citizen', 'science', 'citsci', 'cienciaciudadana', 'CitizenScience']
    df = filter_by_interest(df, interest)
    df = filter_by_topic(df, keywords, stopwords)
    df = df[['Usuario', 'Texto']]
    df = df.dropna()
    # Drop retweets, keeping only original tweets:
    idx = df[df['Texto'].str.match('RT @')]
    df = df.drop(idx.index)
    for row in df['Texto']:
        hashtags.extend(re.findall(r'#(\w+)', row.lower()))
    unique_string = ' '.join(hashtags)
    wordcloud = WordCloud(width=900, height=900, background_color='white', stopwords=stop_words,
                          min_font_size=10, max_words=10405, collocations=False,
                          colormap='winter').generate(unique_string)
    plt.figure(figsize=(8, 8), facecolor=None)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.savefig("wc.png")
# Wordcloud for main hashtags plotted inside a logo:
def wordcloud_mainhtlogo(df, keywords=None, stopwords=None, keywords2=None, stopwords2=None, interest=None, image=None):
    """
    Given a DataFrame containing all the tweets, the function shows a wordcloud with the hashtags
    (outside retweets) displayed by frequency inside the silhouette of the given image

    :param df: DataFrame with all the tweets
    :param keywords: List of words acting as key to filter the DataFrame
    :param stopwords: List of words destined to filter out the tweets that contain them
    :param keywords2: List of words acting as key to filter the DataFrame according to a subtopic
    :param stopwords2: List of words destined to filter out the tweets that contain them according to a subtopic
    :param interest: Active interest from the different categories available from the Lynguo tool
    :param image: Image file to plot the wordcloud inside
    :return: Wordcloud inside desired image with the hashtags by frequency
    """
    hashtags = []
    stop_words = ['citizenscience', 'rt', 'citizen', 'science', 'citsci', 'cienciaciudadana', 'CitizenScience']
    df = filter_by_interest(df, interest)
    df = filter_by_topic(df, keywords, stopwords)
    df = filter_by_subtopic(df, keywords2, stopwords2)
    df = df[['Usuario', 'Texto']].copy()
    df = df.dropna()
    # Drop retweets, keeping only original tweets:
    idx = df[df['Texto'].str.match('RT @')]
    df = df.drop(idx.index)
    for row in df['Texto']:
        hashtags.extend(re.findall(r'#(\w+)', row.lower()))
    unique_string = ' '.join(hashtags)
    # Convert the logo into an int32 mask for WordCloud; transform_format is a
    # sibling helper that presumably remaps pixel values — confirm against its
    # definition elsewhere in this file.
    logo = np.array(Image.open(image))
    transformed_logo = np.ndarray((logo.shape[0], logo.shape[1]), np.int32)
    for i in range(len(logo)):
        transformed_logo[i] = list(map(transform_format, logo[i]))
    wc = WordCloud(width=900, height=900, background_color='ghostwhite', stopwords=stop_words,
                   min_font_size=5, max_font_size=30, max_words=10405, collocations=False,
                   mask=transformed_logo, contour_width=2, contour_color='cornflowerblue',
                   mode='RGB', colormap='summer').generate(unique_string)
    plt.figure(figsize=[25, 10])
    plt.imshow(wc)
    plt.axis("off")
    plt.show()
# Wordcloud for hashtags in the RTs:
def wordcloudRT(df, keywords=None, stopwords=None, keywords2=None, stopwords2=None, interest=None):
    """
    Given a DataFrame containing all the tweets, the function shows a wordcloud with the hashtags
    (inside retweets) displayed by frequency

    :param df: DataFrame with all the tweets
    :param keywords: List of words acting as key to filter the DataFrame
    :param stopwords: List of words destined to filter out the tweets that contain them
    :param keywords2: List of words acting as key to filter the DataFrame according to a subtopic
    :param stopwords2: List of words destined to filter out the tweets that contain them according to a subtopic
    :param interest: Active interest from the different categories available from the Lynguo tool
    :return: Wordcloud with the hashtags by frequency
    """
    hashtags = []
    stop_words = ['citizenscience', 'rt', 'citizen', 'science', 'citsci', 'cienciaciudadana', 'CitizenScience']
    df = filter_by_interest(df, interest)
    df = filter_by_topic(df, keywords, stopwords)
    df = filter_by_subtopic(df, keywords2, stopwords2)
    df = df[['Usuario', 'Texto']].copy()
    df = df.dropna()
    # Keep only the rows that are retweets:
    df = df[df['Texto'].str.contains('RT @', na=False)]
    for row in df['Texto']:
        hashtags.extend(re.findall(r'#(\w+)', row.lower()))
    unique_string = ' '.join(hashtags)
    wordcloud = WordCloud(width=900, height=900, background_color='white', stopwords=stop_words,
                          min_font_size=10, max_words=10405, collocations=False,
                          colormap='winter').generate(unique_string)
    plt.figure(figsize=(8, 8), facecolor=None)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.show()
# Wordcloud for hashtags in the RTs, plotted inside a logo:
def wordcloudRT_logo(df, keywords=None, stopwords=None, keywords2=None, stopwords2=None, interest=None, image=None):
    """
    Given a DataFrame containing all the tweets, the function shows a wordcloud with the hashtags
    (inside retweets) displayed by frequency inside the silhouette of the given image

    :param df: DataFrame with all the tweets
    :param keywords: List of words acting as key to filter the DataFrame
    :param stopwords: List of words destined to filter out the tweets that contain them
    :param keywords2: List of words acting as key to filter the DataFrame according to a subtopic
    :param stopwords2: List of words destined to filter out the tweets that contain them according to a subtopic
    :param interest: Active interest from the different categories available from the Lynguo tool
    :param image: Image file to plot the wordcloud inside
    :return: Wordcloud inside desired image with the hashtags by frequency
    """
    hashtags = []
    stop_words = ['citizenscience', 'rt', 'citizen', 'science', 'citsci', 'cienciaciudadana', 'CitizenScience']
    df = filter_by_interest(df, interest)
    df = filter_by_topic(df, keywords, stopwords)
    df = filter_by_subtopic(df, keywords2, stopwords2)
    df = df[['Usuario', 'Texto']].copy()
    df = df.dropna()
    # Keep only the rows that are retweets:
    df = df[df['Texto'].str.contains('RT @', na=False)]
    for row in df['Texto']:
        hashtags.extend(re.findall(r'#(\w+)', row.lower()))
    unique_string = ' '.join(hashtags)
    # Convert the logo into an int32 mask for WordCloud; transform_format is a
    # sibling helper that presumably remaps pixel values — confirm against its
    # definition elsewhere in this file.
    logo = np.array(Image.open(image))
    transformed_logo = np.ndarray((logo.shape[0], logo.shape[1]), np.int32)
    for i in range(len(logo)):
        transformed_logo[i] = list(map(transform_format, logo[i]))
    wc = WordCloud(width=900, height=900, background_color='ghostwhite', stopwords=stop_words,
                   min_font_size=5, max_font_size=30, max_words=10405, collocations=False,
                   mask=transformed_logo, contour_width=2, contour_color='cornflowerblue',
                   mode='RGB', colormap='summer').generate(unique_string)
    plt.figure(figsize=[25, 10])
    plt.imshow(wc)
    plt.axis("off")
    plt.show()
# Calculation of the most used words:
# The function uses the column Texto; the returned lists are sorted by frequency.
def most_common(df, keywords=None, stopwords=None, keywords2=None, stopwords2=None, interest=None):
    """
    Given a DataFrame containing all the tweets, the function returns a list of (word, count)
    tuples for the most used words, a list of these words and a list with the number of times
    these words appear

    :param df: DataFrame with all the tweets
    :param keywords: List of words acting as key to filter the DataFrame
    :param stopwords: List of words destined to filter out the tweets that contain them
    :param keywords2: List of words acting as key to filter the DataFrame according to a subtopic
    :param stopwords2: List of words destined to filter out the tweets that contain them according to a subtopic
    :param interest: Active interest from the different categories available from the Lynguo tool
    :return: Tuples list containing word and number of times, list with the words and list with the times these words appear
    """
    df = filter_by_interest(df, interest)
    df = filter_by_topic(df, keywords, stopwords)
    df = filter_by_subtopic(df, keywords2, stopwords2)
    df = df[['Usuario', 'Texto']].copy()
    df = df.dropna()
    subset = df['Texto'].dropna()
    words = " ".join(subset).lower().split()
    token = pos_tag(words, tagset='universal', lang='eng')
    # Keep only nouns, verbs and adjectives. BUGFIX: the original condition was
    # the tuple (t[1] == 'NOUN', t[1] == 'VERB', t[1] == 'ADJ'), which is always
    # truthy and therefore kept every token regardless of its POS tag.
    word_list = [t[0] for t in token if t[1] in ('NOUN', 'VERB', 'ADJ')]
    # Keep only tokens that start with a letter (the class also admits '-',
    # preserved from the original pattern). \A anchors at the token start.
    alpha_re = re.compile(r"\A[a-z-A-Z]+")
    words = [word for word in word_list if alpha_re.match(word)]
    # Drop URL tokens ("http...", "https..."):
    url_re = re.compile(r'htt(\w+)')
    words = [word for word in words if not url_re.match(word)]
    count_word = Counter(words)
    # Build the stopword set: ad-hoc noise tokens seen in tweets plus the
    # English/Spanish/German NLTK lists, wordcloud's STOPWORDS and punctuation.
    new_elements = (
        '\\n', 'rt', '?', '¿', '&', 'that?s', '??', '-', 'the', 'to', 'co', 'n', 'https', 'we?re', 'everyone?s',
        'supporters?', 'z', 'here:', 'science,', 'project.', 'citizen', 'science', 'us', 'student?', 'centre?', 'science?',
        ')', 'media?)', 'education?', 'reuse,', 'older!', 'scientists?', 'don?t', 'it?s', 'i?m', 'w/', 'w', 'more:')
    banned = stopwords.words('english')
    banned.extend(new_elements)
    banned.extend(stopwords.words('spanish'))
    banned.extend(STOPWORDS)
    banned.extend(stopwords.words('german'))
    banned.extend(string.punctuation)
    for word in set(banned):
        # Counter.__delitem__ does not raise for missing keys.
        del count_word[word]
    tuples_dict = sorted(count_word.items(), key=lambda x: x[1], reverse=True)
    words_pt = [pair[0] for pair in tuples_dict]
    numbers_pt = [pair[1] for pair in tuples_dict]
    return tuples_dict, words_pt, numbers_pt
# Wordcloud of the top most used words:
def most_commonwc(df, keywords=None, stopwords=None, keywords2=None, stopwords2=None, interest=None):
    """
    Given a DataFrame containing all the tweets, the function shows a wordcloud with the most used
    words in these tweets

    :param df: DataFrame with all the tweets
    :param keywords: List of words acting as key to filter the DataFrame
    :param stopwords: List of words destined to filter out the tweets that contain them
    :param keywords2: List of words acting as key to filter the DataFrame according to a subtopic
    :param stopwords2: List of words destined to filter out the tweets that contain them according to a subtopic
    :param interest: Active interest from the different categories available from the Lynguo tool
    :return: Wordcloud with the most used words displayed in it
    """
    df = filter_by_interest(df, interest)
    df = filter_by_topic(df, keywords, stopwords)
    df = filter_by_subtopic(df, keywords2, stopwords2)
    df = df[['Usuario', 'Texto']].copy()
    df = df.dropna()
    subset = df['Texto'].dropna()
    words = " ".join(subset).lower().split()
    token = pos_tag(words, tagset='universal', lang='eng')
    # Keep only nouns, verbs and adjectives. BUGFIX: the original condition was
    # the tuple (t[1] == 'NOUN', t[1] == 'VERB', t[1] == 'ADJ'), which is always
    # truthy and therefore kept every token regardless of its POS tag.
    word_list = [t[0] for t in token if t[1] in ('NOUN', 'VERB', 'ADJ')]
    # Keep only tokens that start with a letter (the class also admits '-',
    # preserved from the original pattern). \A anchors at the token start.
    alpha_re = re.compile(r"\A[a-z-A-Z]+")
    words = [word for word in word_list if alpha_re.match(word)]
    # Drop URL tokens ("http...", "https..."):
    url_re = re.compile(r'htt(\w+)')
    words = [word for word in words if not url_re.match(word)]
    count_word = Counter(words)
    # Build the stopword set: ad-hoc noise tokens seen in tweets plus the
    # English/Spanish/German NLTK lists, wordcloud's STOPWORDS and punctuation.
    new_elements = (
        '\\n', 'rt', '?', '¿', '&', 'that?s', '??', '-', 'the', 'to', 'co', 'n', 'https', 'we?re', 'everyone?s',
        'supporters?', 'z', 'here:', 'science,', 'project.', 'citizen', 'science', 'us', 'student?', 'centre?', 'science?',
        ')', 'media?)', 'education?', 'reuse,', 'older!', 'scientists?', 'don?t', 'it?s', 'i?m', 'w/', 'w', 'more:')
    banned = stopwords.words('english')
    banned.extend(new_elements)
    banned.extend(stopwords.words('spanish'))
    banned.extend(STOPWORDS)
    banned.extend(stopwords.words('german'))
    banned.extend(string.punctuation)
    stopset = set(banned)
    for word in stopset:
        # Counter.__delitem__ does not raise for missing keys.
        del count_word[word]
    wordcloud = WordCloud(width=900, height=900, background_color='white', stopwords=stopset,
                          min_font_size=10, max_words=300, collocations=False,
                          colormap='winter').generate_from_frequencies(count_word)
    plt.figure(figsize=(8, 8), facecolor=None)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.tight_layout(pad=0)
    # BUGFIX: the original ended with the bare attribute "plt.show" (no call),
    # so the figure was never displayed.
    plt.show()
def sentiment_analyser(df_entry, keywords=None, stopwords=None, interest=None):
    """
    Given a DataFrame containing all the tweets, the function returns a DataFrame with the VADER
    sentiment scores for each tweet together with its author and text

    :param df_entry: A DataFrame containing all the tweets
    :param keywords: List of words acting as key to filter the DataFrame
    :param stopwords: List of words destined to filter out the tweets that contain them
    :param interest: Active interest from the different categories available from the Lynguo tool
    :return: DataFrame with the VADER score columns plus 'Usuario' and 'Texto'
    """
    analyser = SentimentIntensityAnalyzer()
    df = filter_by_interest(df_entry, interest)
    df = filter_by_topic(df, keywords, stopwords)
    df = df[['Texto', 'Usuario']]
    df = df.dropna()
    users = df['Usuario']
    texts = df['Texto']
    list_of_dicts = []
    for sentence in texts:
        scores = analyser.polarity_scores(sentence)
        print(scores)
        list_of_dicts.append(scores)
    df_sentiment = pd.DataFrame(list_of_dicts)
    # BUGFIX: assign by position. After dropna() the filtered Series keep the
    # original (gappy) index, while df_sentiment has a fresh RangeIndex;
    # assigning the Series directly would align on index and fill rows with NaN.
    df_sentiment['Usuario'] = users.to_numpy()
    df_sentiment['Texto'] = texts.to_numpy()
    return df_sentiment
# Function to extract the days:
def getDays(df):
    """
    Return the unique values of the 'Fecha' column in ascending order.

    :param df: DataFrame with a 'Fecha' column
    :return: Sorted numpy array with the unique dates
    """
    unique_days = pd.unique(df['Fecha'])
    return np.sort(unique_days)
def make_weightedDiGraph(ejes):
    """
    Build a directed graph from a list of (source, target) edges, weighting each
    edge by the number of times it appears in the list.

    :param ejes: Iterable of two-element edges (source, target)
    :return: networkx DiGraph with a 'weight' attribute on every edge
    """
    edge_counts = Counter(tuple(edge) for edge in ejes)
    return nx.DiGraph((src, dst, {'weight': count})
                      for (src, dst), count in edge_counts.items())