⚠️ This writing is a work in progress. The functions work, but the text still needs retouching. ⚠️
Please read everything found on the main page before continuing; disclaimer and all.
In graph theory, a clustering coefficient is a measure of the degree to which nodes in a graph tend to cluster together. Evidence suggests that in most real-world networks, and in particular social networks, nodes tend to create tightly knit groups characterized by a relatively high density of ties; this likelihood tends to be greater than the average probability of a tie randomly established between two nodes (Holland and Leinhardt, 1971; Watts and Strogatz, 1998).
Two versions of this measure exist: the global and the local. The global version was designed to give an overall indication of the clustering in the network, whereas the local gives an indication of the embeddedness of single nodes. - GeeksforGeeks
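Concretely, for a node i with degree k_i that sits on T(i) triangles, the local clustering coefficient is C_i = 2*T(i) / (k_i*(k_i - 1)): the fraction of possible links among i's neighbors that actually exist. The average clustering coefficient of a graph is the mean of C_i over all nodes, which is exactly what the sampling routine below approximates.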
```python
# This example is from the GeeksforGeeks link above
import random

def average_clustering(G, trials=1000):
    """Estimates the average clustering coefficient of G.

    The local clustering of each node in `G` is the fraction of triangles
    that actually exist over all possible triangles in its neighborhood.
    The average clustering coefficient of a graph `G` is the mean of the
    local clusterings.

    This function finds an approximate average clustering coefficient for
    G by repeating `n` times (defined in `trials`) the following
    experiment: choose a node at random, choose two of its neighbors at
    random, and check if they are connected. The approximate coefficient
    is the fraction of triangles found over the number of trials [1]_.

    Parameters
    ----------
    G : NetworkX graph
    trials : integer
        Number of trials to perform (default 1000).

    Returns
    -------
    c : float
        Approximated average clustering coefficient.
    """
    n = len(G)
    triangles = 0
    nodes = list(G)  # list(G) so nodes can be indexed by position
    for i in [int(random.random() * n) for i in range(trials)]:
        nbrs = list(G[nodes[i]])
        if len(nbrs) < 2:
            continue
        u, v = random.sample(nbrs, 2)
        if u in G[v]:
            triangles += 1
    return triangles / float(trials)
```
```python
cc = nx.average_clustering(G)
nx.draw(G)
```
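As a quick sanity check, the sampled estimate from average_clustering above should land near NetworkX's exact value. A minimal sketch; the Watts-Strogatz graph and its parameters are arbitrary stand-ins:

```python
import random
import networkx as nx

random.seed(42)  # make the sampling reproducible
G = nx.connected_watts_strogatz_graph(100, 6, 0.1)  # arbitrary small-world test graph

approx = average_clustering(G, trials=5000)  # sampled estimate (function above)
exact = nx.average_clustering(G)             # exact value from NetworkX
print(approx, exact)                         # the two should be close
```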
Data Prep
```python
rdf = u.getData('https://services1.arcgis.com/mVFRs7NF4iFitgbY/ArcGIS/rest/services/Biz1_/FeatureServer/0/query?where=1%3D1&outFields=*&returnGeometry=true&f=pgeojson')
# rdf.set_index('CSA2010', drop=True, inplace=True)
# Drop ArcGIS bookkeeping columns we don't need
rdf.drop(labels=['OBJECTID_1', 'Shape__Area', 'Shape__Length'], axis=1, inplace=True)
# Keep only the indicator columns (biz1_*) plus the CSA labels
vs10to19Ind = rdf.filter(regex='biz1|CSA2010', axis=1)
```
Get only the columns we want to work with
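If the dataplay helper u.getData used above isn't available in your environment, the same ArcGIS feature service can be read directly; a minimal sketch, assuming geopandas is installed (its read_file accepts GeoJSON URLs):

```python
import geopandas as gpd

# Same Biz1_ feature service queried above, returned as GeoJSON
url = ('https://services1.arcgis.com/mVFRs7NF4iFitgbY/ArcGIS/rest/services/'
       'Biz1_/FeatureServer/0/query?where=1%3D1&outFields=*&returnGeometry=true&f=pgeojson')
rdf = gpd.read_file(url)
```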
What we want is one record for every year, with every CSA as a column. To do this, transpose the dataset: set the CSA labels (first row) as our columns, relabel the index (for clarity), and cast our datatypes.
```python
# Assumption: the transpose itself was not shown in the original cell
vs10to19Indt = vs10to19Ind.T
vs10to19Indt.columns = vs10to19Indt.iloc[0]  # first row holds the CSA labels
vs10to19Indt = vs10to19Indt[1:]              # drop the label row
vs10to19Indt.index.name = 'variable'
vs10to19Indt = vs10to19Indt.astype('float64')
```
a. Calculate the correlation matrix
cor_matrix contains the full correlation matrix. The table below shows a snapshot of the first 5 rows.
```python
cor_matrix = vs10to19Indt.corr()
# Show the first 5 rows
cor_matrix.head(5)
```
```python
import matplotlib.pyplot as plt

f = plt.figure(figsize=(19, 15))
# Plot the correlation matrix of the transposed indicator frame
plt.matshow(vs10to19Indt.corr(), fignum=f.number)
irange = range(vs10to19Indt.select_dtypes(['number']).shape[1])
labels = vs10to19Indt.select_dtypes(['number']).columns
# plt.xticks(irange, labels, fontsize=14, rotation=45)
plt.yticks(irange, labels, fontsize=14)
cb = plt.colorbar()
cb.ax.tick_params(labelsize=14)
plt.title('Correlation Matrix', fontsize=16);
```
```python
import numpy as np

# Save the CSA labels before converting the dataframe to a matrix
lblVals = cor_matrix.index.values
cor_matrix = np.asmatrix(cor_matrix)
```
b. Create graph
```python
G = nx.from_numpy_matrix(cor_matrix)
# Relabel the nodes to match the CSA names
G = nx.relabel_nodes(G, lambda x: lblVals[x])
# Show the first 5 edges with their corresponding weights
# OLD: G.edges(data=True)[:5]
list(G.edges(data=True))[0:5]
```
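Note that NetworkX 3.x removed from_numpy_matrix (and NumPy discourages np.matrix); on newer versions the equivalent is from_numpy_array. A sketch of the substitution:

```python
# NetworkX >= 3.0: build the graph from a plain ndarray instead of np.matrix
A = np.asarray(cor_matrix)
G = nx.from_numpy_array(A)
G = nx.relabel_nodes(G, lambda x: lblVals[x])
```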
c. Style the nodes based on the number of edges linked (degree)
```python
# Function to create and display networks from the correlation matrix.
# (stock1/stock2 naming kept from the stock-correlation tutorial this
# function was adapted from; here the nodes are CSAs)
def create_corr_network_5(G, corr_direction, min_correlation):
    # Create a copy of the graph
    H = G.copy()

    # Check all the edges and remove some based on corr_direction
    for stock1, stock2, weight in G.edges(data=True):
        # If we only want to see the positive correlations,
        # delete the edges with weight smaller than 0
        if corr_direction == "positive":
            # min_correlation adds a floor: if the correlation is weaker
            # than the minimum, delete the edge
            if weight["weight"] < 0 or weight["weight"] < min_correlation:
                H.remove_edge(stock1, stock2)
        # This part runs if corr_direction is negative and removes edges
        # with weights greater than or equal to 0
        else:
            # min_correlation adds a ceiling: if the correlation is weaker
            # than the minimum, delete the edge
            if weight["weight"] >= 0 or weight["weight"] > min_correlation:
                H.remove_edge(stock1, stock2)

    # Create a list of edges and a list of their weights
    edges, weights = zip(*nx.get_edge_attributes(H, 'weight').items())
    # Increase the value of the weights so they are more visible in the graph
    weights = tuple([(1 + abs(x))**2 for x in weights])

    # Calculate the degree of each node
    d = nx.degree(H)
    # Create a list of nodes and a list of their degrees,
    # used later for the node sizes
    nodelist, node_sizes = zip(*d)

    # Positions
    positions = nx.circular_layout(H)

    # Figure size
    plt.figure(figsize=(15, 15))

    # Draw the nodes; node size is based on degree
    nx.draw_networkx_nodes(H, positions, node_color='#DA70D6', nodelist=nodelist,
                           node_size=tuple([x**3 for x in node_sizes]), alpha=0.8)

    # Styling for labels
    nx.draw_networkx_labels(H, positions, font_size=8, font_family='sans-serif')

    # Edge colors based on weight direction
    if corr_direction == "positive":
        edge_colour = plt.cm.GnBu
    else:
        edge_colour = plt.cm.PuRd

    # Draw the edges: width=weights and edge_color=weights so that edges
    # reflect the weight parameter; edge_cmap is the color scale based on
    # the weight; edge_vmin and edge_vmax assign the min and max weights
    nx.draw_networkx_edges(H, positions, edgelist=edges, style='solid',
                           width=weights, edge_color=weights, edge_cmap=edge_colour,
                           edge_vmin=min(weights), edge_vmax=max(weights))

    # Display the graph without axes
    plt.axis('off')
    # Save the image
    plt.savefig("part5" + corr_direction + ".png", format="PNG")
    plt.show()
```
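The function can then be called for each direction; the 0.7 and -0.7 thresholds below are arbitrary example values, not anything prescribed by the data:

```python
# Keep only edges with correlation >= 0.7
create_corr_network_5(G, corr_direction="positive", min_correlation=0.7)

# Keep only edges with correlation <= -0.7
create_corr_network_5(G, corr_direction="negative", min_correlation=-0.7)
```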
We want to create a linear regression for each CSA using {X: year, Y: value} for a given indicator.
```python
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import LinearRegression

# Create 3 columns: CSA2010, variable, value
wdf = vs10to19Ind.melt(id_vars='CSA2010', value_vars=vs10to19Ind.columns[1:])
# Convert indicator labels into our X (Year) column
wdf['variable'] = wdf['variable'].apply(lambda x: int(x.replace('biz1_', '')))

findf = {'CSA': [], 'B': [], 'M': []}

# For each CSA
for csa in wdf.CSA2010.unique():
    CsaData = wdf[wdf['CSA2010'] == csa]
    X = CsaData[['variable']]  # returns: [10 11 12 13 14 15 16 17 18 19]
    y = CsaData[['value']]
    regressor = LinearRegression()
    regressor.fit(X, y)
    y_pred = regressor.predict(X)
    plt.scatter(X, y, color='red')
    plt.plot(X, y_pred, color='blue')
    plt.title('biz1: ' + csa)
    plt.xlabel('YEAR')
    plt.ylabel('VALUE')
    plt.show()
    # coef_ is the slope (M); intercept_ is the intercept (B)
    print('M: ', regressor.coef_, 'B: ', regressor.intercept_)
    findf['CSA'].append(csa)
    findf['B'].append(regressor.intercept_[0])
    findf['M'].append(regressor.coef_[0][0])
```
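The next cell expects a transposed results frame lin_reg_dft, whose construction isn't shown in the original; a minimal sketch of how it could be built from findf:

```python
import pandas as pd

# Assumed construction: one row per CSA (CSA, B, M), then transpose so the
# first row of the result holds the CSA labels
lin_reg_df = pd.DataFrame(findf)
lin_reg_dft = lin_reg_df.T
```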
```python
lin_reg_dft.columns = lin_reg_dft.iloc[0]  # first row holds the CSA labels
lin_reg_dft = lin_reg_dft[1:]              # drop the label row
lin_reg_dft.index.name = 'variable'
lin_reg_dft = lin_reg_dft.astype('float64')
```
We may need to normalize the data for this to be usable.
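A minimal sketch of one option, min-max scaling each variable row to [0, 1] so the intercepts (B) and slopes (M) are comparable in magnitude; other schemes such as z-scores would work as well:

```python
# Min-max scale along each row (B and M live on very different scales)
row_min = lin_reg_dft.min(axis=1)
row_rng = lin_reg_dft.max(axis=1) - row_min
lin_reg_dft_norm = lin_reg_dft.sub(row_min, axis=0).div(row_rng, axis=0)
```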
```python
import matplotlib.pyplot as plt

f = plt.figure(figsize=(19, 15))
# Plot the correlation matrix of the regression results
# (assumes the frame of interest here is lin_reg_dft)
plt.matshow(lin_reg_dft.corr(), fignum=f.number)
irange = range(lin_reg_dft.select_dtypes(['number']).shape[1])
labels = lin_reg_dft.select_dtypes(['number']).columns
# plt.xticks(irange, labels, fontsize=14, rotation=45)
plt.yticks(irange, labels, fontsize=14)
cb = plt.colorbar()
cb.ax.tick_params(labelsize=14)
plt.title('Correlation Matrix', fontsize=16);
```
```python
# Colab development setup for the dataplay repo
!pip install nbdev
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/My Drive/'Software Development Documents'/dataplay/
# !pip install dataplay
```