
⚠️ The writing is a work in progress. The functions work, but the text is still being retouched. ⚠️

Please read everything found on the main page before continuing, disclaimer and all.


In graph theory, a clustering coefficient is a measure of the degree to which nodes in a graph tend to cluster together. Evidence suggests that in most real-world networks, and in particular social networks, nodes tend to create tightly knit groups characterized by a relatively high density of ties; this likelihood tends to be greater than the average probability of a tie randomly established between two nodes (Holland and Leinhardt, 1971; Watts and Strogatz, 1998).

Two versions of this measure exist: the global and the local. The global version was designed to give an overall indication of the clustering in the network, whereas the local version gives an indication of the embeddedness of single nodes. - GeeksforGeeks

# this example is adapted from the GeeksforGeeks link above
import random
import networkx as nx

def average_clustering(G, trials=1000):
    """Estimate the average clustering coefficient of G.

    The local clustering of each node in `G` is the
    fraction of triangles that actually exist over
    all possible triangles in its neighborhood.
    The average clustering coefficient of a graph
    `G` is the mean of local clusterings.

    This function finds an approximate average
    clustering coefficient for G by repeating the
    following experiment `trials` times: choose a
    node at random, choose two of its neighbors at
    random, and check if they are connected. The
    approximate coefficient is the fraction of
    triangles found over the number of trials.

    Parameters
    ----------
    G : NetworkX graph

    trials : integer
        Number of trials to perform (default 1000).

    Returns
    -------
    c : float
        Approximated average clustering coefficient.
    """
    n = len(G)
    triangles = 0
    nodes = list(G.nodes())  # list() so nodes can be indexed by position
    for i in [int(random.random() * n) for i in range(trials)]:
        nbrs = list(G[nodes[i]])
        if len(nbrs) < 2:
            continue
        u, v = random.sample(nbrs, 2)
        if u in G[v]:
            triangles += 1
    return triangles / float(trials)

# Exact average clustering coefficient, for comparison.
# (G here is the correlation graph constructed in section b. below.)
cc = nx.average_clustering(G)
cc
nx.draw(G)
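
As a quick sanity check (an addition, not part of the GeeksforGeeks original), the sampled estimate can be compared with NetworkX's exact routine on a small random graph. The graph parameters and trial count below are arbitrary illustrative choices:

import random
import networkx as nx

random.seed(42)
G_demo = nx.erdos_renyi_graph(n=100, p=0.1, seed=42)  # hypothetical demo graph

approx = average_clustering(G_demo, trials=5000)  # the estimator defined above
exact = nx.average_clustering(G_demo)             # NetworkX's exact computation
print('approximate:', round(approx, 3), 'exact:', round(exact, 3))

With enough trials the two values agree to within sampling error.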

Data Prep

# `u` is a data-loading helper assumed to have been imported earlier in this series.
rdf = u.getData('https://services1.arcgis.com/mVFRs7NF4iFitgbY/ArcGIS/rest/services/Biz1_/FeatureServer/0/query?where=1%3D1&outFields=*&returnGeometry=true&f=pgeojson')
# rdf.set_index('CSA2010', drop=True, inplace=True)
rdf.drop(labels=['OBJECTID_1', 'Shape__Area', 'Shape__Length'], axis=1, inplace=True)

# Keep only the indicator columns (biz1*) plus the CSA labels
vs10to19Ind = rdf.filter(regex='biz1|CSA2010', axis=1)

Get only the columns we want to work with

What we want is one record per year, with each CSA as a column. To do this, transpose the dataset, set the CSA labels (first row) as the columns, relabel the index (for clarity), and cast the datatypes.

vs10to19Indt = vs10to19Ind.T  # transpose so that rows are years and columns are CSAs
vs10to19Indt.columns = vs10to19Indt.iloc[0]
vs10to19Indt = vs10to19Indt[1:]
vs10to19Indt.index.name = 'variable'
vs10to19Indt = vs10to19Indt.astype('float64')
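
As a quick check (added here, not in the original notebook), the reshaped table should now have one row per yearly variable and one column per CSA, all cast to floats:

vs10to19Indt.dtypes.unique()  # expect: [dtype('float64')]
vs10to19Indt.head()           # rows are the yearly biz1 variables; columns are CSA names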

a. Calculate the correlation matrix

cor_matrix contains the full correlation matrix. The table below shows a snapshot of the first 5 rows.

cor_matrix = vs10to19Indt.corr()
# show the first 5 rows
cor_matrix.head(5)
import matplotlib.pyplot as plt
f = plt.figure(figsize=(19, 15))
plt.matshow(vs10to19Indt.corr(), fignum=f.number)
irange = range(vs10to19Indt.select_dtypes(['number']).shape[1])
labels = vs10to19Indt.select_dtypes(['number']).columns
# plt.xticks(irange, labels, fontsize=14, rotation=45)
plt.yticks(irange, labels, fontsize=14)
cb = plt.colorbar()
cb.ax.tick_params(labelsize=14)
plt.title('Correlation Matrix', fontsize=16);
lblVals = cor_matrix.index.values
cor_matrix = cor_matrix.to_numpy()  # np.asmatrix is deprecated; a plain ndarray works

b. Create graph

G = nx.from_numpy_array(cor_matrix)  # from_numpy_matrix was removed in NetworkX 3.0

# relabel the nodes to match the CSA names
G = nx.relabel_nodes(G, lambda x: lblVals[x])

# show the first 5 edges with their corresponding weights
list(G.edges(data=True))[0:5]
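
Before plotting anything, one quick way (an addition, not in the original walkthrough) to surface the strongest relationships is to sort the edge list by absolute weight, skipping the self-loops contributed by the matrix diagonal:

# the diagonal of the correlation matrix becomes self-loops of weight 1.0
strongest = sorted(
    (e for e in G.edges(data=True) if e[0] != e[1]),
    key=lambda e: abs(e[2]['weight']),
    reverse=True,
)
strongest[:5]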

c. Styling the nodes based on the number of edges linked (degree)

# function to create and display a network from the correlation matrix
 
 def create_corr_network_5(G, corr_direction, min_correlation):
     ##Creates a copy of the graph
     H = G.copy()
     
     ##Checks all the edges and removes some based on corr_direction
     for stock1, stock2, weight in G.edges(data=True):
        ## if we only want to see the positive correlations, delete the edges with weight smaller than 0
         if corr_direction == "positive":
             ####it adds a minimum value for correlation. 
             ####If correlation weaker than the min, then it deletes the edge
             if weight["weight"] <0 or weight["weight"] < min_correlation:
                 H.remove_edge(stock1, stock2)
        ## this part runs if corr_direction is negative: remove edges with weights equal to or larger than 0
         else:
             ####it adds a minimum value for correlation. 
             ####If correlation weaker than the min, then it deletes the edge
             if weight["weight"] >=0 or weight["weight"] > min_correlation:
                 H.remove_edge(stock1, stock2)
                 
     
    # creates a list of edges and a list of the weights
     edges,weights = zip(*nx.get_edge_attributes(H,'weight').items())
     
     ### increases the value of weights, so that they are more visible in the graph
     weights = tuple([(1+abs(x))**2 for x in weights])
     
     #####calculates the degree of each node
     d = nx.degree(H)
    ##### creates a list of nodes and a list of their degrees, used later for their sizes
     nodelist, node_sizes = zip(*d)
 
     #positions
     positions=nx.circular_layout(H)
     
     #Figure size
     plt.figure(figsize=(15,15))
 
     #draws nodes
     nx.draw_networkx_nodes(H,positions,node_color='#DA70D6',nodelist=nodelist,
                            #####the node size will be now based on its degree
                            node_size=tuple([x**3 for x in node_sizes]),alpha=0.8)
     
     #Styling for labels
     nx.draw_networkx_labels(H, positions, font_size=8, 
                             font_family='sans-serif')
     
     ###edge colors based on weight direction
     if corr_direction == "positive":
         edge_colour = plt.cm.GnBu 
     else:
         edge_colour = plt.cm.PuRd
         
     #draws the edges
     nx.draw_networkx_edges(H, positions, edges,style='solid',
                           ###adds width=weights and edge_color = weights 
                           ###so that edges are based on the weight parameter 
                           ###edge_cmap is for the color scale based on the weight
                           ### edge_vmin and edge_vmax assign the min and max weights for the width
                           width=weights, edge_color = weights, edge_cmap = edge_colour,
                           edge_vmin = min(weights), edge_vmax=max(weights))
 
     # displays the graph without axis
     plt.axis('off')
     #saves image
     plt.savefig("part5" + corr_direction + ".png", format="PNG")
     plt.show() 
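
The function is defined but never invoked above, so here is a usage sketch. The threshold values are illustrative, not recommended settings; note that if no edges survive a threshold, the zip(...) line inside the function will raise, so the thresholds may need loosening for a given dataset:

# draw only strongly positive correlations, then only strongly negative ones
create_corr_network_5(G, corr_direction='positive', min_correlation=0.7)
create_corr_network_5(G, corr_direction='negative', min_correlation=-0.7)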

We want to create a linear regression for each CSA, using {X: year, Y: value} for a given indicator.

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import LinearRegression

# Create 3 columns: CSA2010, variable, value
wdf = vs10to19Ind.melt(id_vars='CSA2010', value_vars=vs10to19Ind.columns[1:])

# Convert indicator labels into our X (Year) column
wdf['variable'] = wdf['variable'].apply(lambda x: int(x.replace('biz1_', '')))

findf = {'CSA': [], 'B': [], 'M': []}
# For each CSA, fit year -> value and record the intercept (B) and slope (M)
for csa in wdf.CSA2010.unique():
    CsaData = wdf[wdf['CSA2010'] == csa]
    X = CsaData[['variable']]  # years, e.g. [10 11 12 13 14 15 16 17 18 19]
    y = CsaData[['value']]
    regressor = LinearRegression()
    regressor.fit(X, y)
    plt.scatter(X, y, color='red')
    plt.plot(X, regressor.predict(X), color='blue')
    plt.title('biz1: ' + csa)
    plt.xlabel('YEAR')
    plt.ylabel('VALUE')
    plt.show()
    print('B: ', regressor.intercept_, 'M: ', regressor.coef_)
    findf['CSA'].append(csa)
    findf['B'].append(regressor.intercept_[0])
    findf['M'].append(regressor.coef_[0][0])
# Build a table from the fitted coefficients and reshape it as before:
# one row per variable (B, M), one column per CSA.
lin_reg_dft = pd.DataFrame(findf).T
lin_reg_dft.columns = lin_reg_dft.iloc[0]
lin_reg_dft = lin_reg_dft[1:]
lin_reg_dft.index.name = 'variable'
lin_reg_dft = lin_reg_dft.astype('float64')

We may need to normalize the data for this to be usable.
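
One option, offered as a sketch rather than the notebook's settled method, is min-max scaling each row of the coefficient table so that intercepts (B) and slopes (M) sit on comparable [0, 1] scales across CSAs; `lin_reg_norm` is a hypothetical name introduced here for illustration:

# scale each variable (row) across CSAs to [0, 1]
row_min = lin_reg_dft.min(axis=1)
row_max = lin_reg_dft.max(axis=1)
lin_reg_norm = lin_reg_dft.sub(row_min, axis=0).div(row_max - row_min, axis=0)
lin_reg_norm.head()

The correlation heatmap below can then be drawn from either the raw or the normalized table.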

import matplotlib.pyplot as plt
f = plt.figure(figsize=(19, 15))
plt.matshow(lin_reg_dft.corr(), fignum=f.number)
irange = range(lin_reg_dft.select_dtypes(['number']).shape[1])
labels = lin_reg_dft.select_dtypes(['number']).columns
# plt.xticks(irange, labels, fontsize=14, rotation=45)
plt.yticks(irange, labels, fontsize=14)
cb = plt.colorbar()
cb.ax.tick_params(labelsize=14)
plt.title('Correlation Matrix', fontsize=16);
t = """ """
 !pip install nbdev
 from google.colab import drive
 drive.mount('/content/drive')
 %cd /content/drive/My Drive/'Software Development Documents'/dataplay/
 
 # !pip install dataplay