Musical loops

In graph theory, a clustering coefficient is a measure of the degree to which nodes in a graph tend to cluster together. Evidence suggests that in most real-world networks, and in particular social networks, nodes tend to create tightly knit groups characterized by a relatively high density of ties; this likelihood tends to be greater than the average probability of a tie randomly established between two nodes (Holland and Leinhardt, 1971; Watts and Strogatz, 1998).

Two versions of this measure exist: the global and the local. The global version was designed to give an overall indication of the clustering in the network, whereas the local version gives an indication of the embeddedness of single nodes. (Source: GeeksforGeeks)

# This example is adapted from the GeeksforGeeks article referenced above.
def average_clustering(G, trials=1000):
    """Estimate the average clustering coefficient of G.

    The local clustering of each node in `G` is the
    fraction of triangles that actually exist over
    all possible triangles in its neighborhood.
    The average clustering coefficient of a graph
    `G` is the mean of local clusterings.

    This function finds an approximate average
    clustering coefficient for G by repeating `n`
    times (defined in `trials`) the following
    experiment: choose a node at random, choose
    two of its neighbors at random, and check if
    they are connected. The approximate coefficient
    is the fraction of triangles found over the
    number of trials.

    Parameters
    ----------
    G : NetworkX graph

    trials : integer
        Number of trials to perform (default 1000).

    Returns
    -------
    c : float
        Approximated average clustering coefficient.
        ``0.0`` when `G` has no nodes or `trials` is not positive.

    """
    import random  # local import so the snippet runs on its own

    # Materialise the nodes as a list so they can be picked by position.
    # G.nodes() is keyed by node label, so indexing it with a random integer
    # fails (or selects the wrong node) whenever labels are not 0..n-1.
    nodes = list(G)
    if not nodes or trials <= 0:
        return 0.0

    n = len(nodes)
    triangles = 0
    for _ in range(trials):
        node = nodes[int(random.random() * n)]
        nbrs = list(G[node])
        # A node with fewer than two neighbours cannot close a triangle;
        # the trial still counts towards the denominator.
        if len(nbrs) < 2:
            continue
        u, v = random.sample(nbrs, 2)
        if u in G[v]:
            triangles += 1
    return triangles / float(trials)
# Average clustering coefficient of G as computed by NetworkX.
cc = nx.average_clustering(G)
# NOTE(review): the original cell evaluated a bare `c`, which is not defined
# anywhere in this document; `cc` is presumably what was meant -- confirm.
print(cc)
nx.draw(G)

### Data Prep

# Fetch every feature of the Biz1 layer from the ArcGIS service as GeoJSON.
# NOTE(review): `u.getData` is a project helper; presumably it returns a
# (Geo)DataFrame -- confirm.
rdf = u.getData(
    'https://services1.arcgis.com/mVFRs7NF4iFitgbY/ArcGIS/rest/services/Biz1_/FeatureServer/0/query?where=1%3D1&outFields=*&returnGeometry=true&f=pgeojson'
)
# rdf.set_index('CSA2010', drop=True, inplace=True)

# Discard the bookkeeping columns that are not indicators.
rdf.drop(columns=['OBJECTID_1', 'Shape__Area', 'Shape__Length'], inplace=True)

# Keep the CSA label column plus every column whose name contains 'biz1'.
vs10to19Ind = rdf.filter(regex='biz1|CSA2010', axis='columns')

Get only the columns we want to work with

We want one record (row) per year, with every CSA as a column. To do this, transpose the dataset, use the CSA labels (the first row after transposing) as the column names, relabel the index (for clarity), and cast the values to a numeric datatype.

# Transpose so that each row is one yearly indicator and each column is one
# CSA. Indexing by 'CSA2010' first makes the CSA labels the column headers
# without relying on that column being the first one.
vs10to19Indt = vs10to19Ind.set_index('CSA2010').T
# Relabel the index for clarity.
vs10to19Indt.index.name = 'variable'
# Cast to floats so numeric operations such as .corr() use every column.
vs10to19Indt = vs10to19Indt.astype('float64')

## a. Calculate the correlation matrix

cor_matrix contains the full correlation matrix. The table below shows a snapshot of the first 5 rows.

# Pairwise correlation between every pair of CSA columns.
cor_matrix = vs10to19Indt.corr()
# Show the first 5 rows.
cor_matrix.head()
import matplotlib.pyplot as plt

# Heat map of the correlation matrix held in `cor_matrix`.
# NOTE(review): the original cell plotted `df.corr()`, but no `df` is defined
# in this document; `cor_matrix` is used instead -- confirm this is intended.
f = plt.figure(figsize=(19, 15))
plt.matshow(cor_matrix, fignum=f.number)
labels = cor_matrix.columns
# One tick per column (the original passed the whole shape tuple to range()).
irange = range(len(labels))
# plt.xticks(irange, labels, fontsize=14, rotation=45)
plt.yticks(irange, labels, fontsize=14)
cb = plt.colorbar()
cb.ax.tick_params(labelsize=14)
# Trailing semicolon suppresses the text-object echo in a notebook.
plt.title('Correlation Matrix', fontsize=16);
# Keep the row labels so the graph nodes can be relabelled with them.
lblVals = cor_matrix.index.values
# Convert to a plain 2-D ndarray; NumPy discourages the np.matrix type and
# the graph constructors only need an array.
cor_matrix = np.asarray(cor_matrix)

## b. Create graph

# Build a weighted graph from the correlation matrix: each non-zero entry
# becomes an edge whose 'weight' is that correlation. from_numpy_array is
# used because from_numpy_matrix was removed in NetworkX 3.0; np.asarray
# makes the call work whether cor_matrix is an ndarray or an np.matrix.
G = nx.from_numpy_array(np.asarray(cor_matrix))

# Relabel the integer nodes 0..n-1 with the matching labels from lblVals.
G = nx.relabel_nodes(G, dict(enumerate(lblVals)))

# Show the first 5 edges with their corresponding weights.
list(G.edges(data=True))[:5]

## c. Styling the nodes based on the number of edges linked (degree)

# Function to create and display networks from the correlation matrix.

def create_corr_network_5(G, corr_direction, min_correlation):
    """Draw the correlation network of `G`, filtered by sign and strength.

    A copy of `G` is pruned so that only edges matching `corr_direction`
    and passing the `min_correlation` threshold remain. The pruned graph is
    drawn on a circular layout, saved as ``part5<corr_direction>.png`` and
    shown. Node size grows with the node's degree; edge width and colour
    grow with the absolute edge weight.

    Parameters
    ----------
    G : NetworkX graph
        Graph whose edges carry a numeric ``'weight'`` attribute.
        It is not modified.
    corr_direction : str
        ``"positive"`` keeps edges with ``weight >= 0`` and
        ``weight >= min_correlation``. Any other value keeps edges with
        ``weight < 0`` and ``weight <= min_correlation``.
    min_correlation : float
        Threshold applied as described above.

    Returns
    -------
    None
    """
    keep_positive = corr_direction == "positive"

    # Prune a copy so the caller's graph is left untouched.
    H = G.copy()
    for node_a, node_b, attrs in G.edges(data=True):
        w = attrs["weight"]
        if keep_positive:
            # Drop negative correlations and those weaker than the minimum.
            drop = w < 0 or w < min_correlation
        else:
            # Drop non-negative correlations and those weaker (closer to
            # zero) than the minimum.
            drop = w >= 0 or w > min_correlation
        if drop:
            H.remove_edge(node_a, node_b)

    # Surviving edges and their weights. Built without zip(*...) unpacking so
    # that a filter which removes every edge yields empty tuples instead of
    # raising ValueError.
    edge_weights = nx.get_edge_attributes(H, 'weight')
    edges = tuple(edge_weights)
    # Inflate the weights so that differences are more visible in the graph.
    weights = tuple((1 + abs(w)) ** 2 for w in edge_weights.values())

    # Degree of every node, used for the node sizes.
    degrees = dict(nx.degree(H))
    nodelist = tuple(degrees)
    node_sizes = tuple(deg ** 3 for deg in degrees.values())

    # Positions
    positions = nx.circular_layout(H)

    # Figure size
    plt.figure(figsize=(15, 15))

    # Draw the nodes, sized by degree.
    nx.draw_networkx_nodes(H, positions, node_color='#DA70D6',
                           nodelist=nodelist, node_size=node_sizes,
                           alpha=0.8)

    # Styling for labels
    nx.draw_networkx_labels(H, positions, font_size=8,
                            font_family='sans-serif')

    # Edge colour map depends on the correlation direction.
    if keep_positive:
        edge_colour = plt.cm.GnBu
    else:
        edge_colour = plt.cm.PuRd

    # Draw the edges; width and colour both follow the (inflated) weight and
    # the colour scale spans the smallest to the largest weight. Skipped when
    # nothing survived the filter, since min()/max() of no weights would fail.
    if edges:
        nx.draw_networkx_edges(H, positions, edgelist=edges, style='solid',
                               width=weights, edge_color=weights,
                               edge_cmap=edge_colour,
                               edge_vmin=min(weights),
                               edge_vmax=max(weights))

    # Display the graph without axes.
    plt.axis('off')
    # Save the image.
    plt.savefig("part5" + corr_direction + ".png", format="PNG")
    plt.show()

We want to fit a linear regression for each CSA, using {X: year, Y: value}, for a given indicator.

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import LinearRegression

# Reshape to long form with 3 columns: CSA2010, variable, value.
# Every column other than the id column is melted, so this does not depend
# on CSA2010 being the first column.
wdf = vs10to19Ind.melt(id_vars='CSA2010')

# Convert indicator labels into our X (year) column by stripping the prefix.
wdf['variable'] = (
    wdf['variable'].str.replace('biz1_', '', regex=False).astype(int)
)

# Collected fit parameters: B is the intercept, M is the slope.
findf = {'CSA': [], 'B': [], 'M': []}
# For each CSA
for csa in wdf.CSA2010.unique():
    # Rows belonging to this CSA only.
    CsaData = wdf[wdf['CSA2010'] == csa]
    X = CsaData[['variable']]  # 2-D feature frame holding the years
    y = CsaData['value']
    regressor = LinearRegression()
    regressor.fit(X, y)
    y_pred = regressor.predict(X)
    plt.scatter(X, y, color='red')
    plt.plot(X, y_pred, color='blue')
    plt.title('biz1: ' + csa)
    plt.xlabel('YEAR')
    plt.ylabel('VALUE')
    plt.show()
    # Labels match the keys used in findf (M = slope, B = intercept).
    print('M: ', regressor.coef_, 'B: ', regressor.intercept_)
    findf['CSA'].append(csa)
    findf['B'].append(regressor.intercept_)
    # coef_ is an array with one entry per feature; there is a single
    # feature here, so store its slope as a scalar.
    findf['M'].append(regressor.coef_[0])
# NOTE(review): the original cell never created `lin_reg_dft`; it is
# presumably the transposed regression results collected in `findf`
# (one column per CSA, rows 'B' and 'M') -- confirm.
lin_reg_dft = pd.DataFrame(
    {
        'B': findf['B'],
        # Slopes may have been stored as 1-element arrays; flatten them.
        'M': [float(np.ravel(m)[0]) for m in findf['M']],
    },
    index=pd.Index(findf['CSA'], name='CSA'),
).T
lin_reg_dft.index.name = 'variable'
lin_reg_dft = lin_reg_dft.astype('float64')

We may need to normalize the data for this to be usable.

import matplotlib.pyplot as plt

# Heat map of the correlations between the numeric columns of `df`.
# NOTE(review): `df` is not defined anywhere in this document -- presumably
# it should be the frame being inspected here (e.g. lin_reg_dft); confirm.
numeric = df.select_dtypes(['number'])
f = plt.figure(figsize=(19, 15))
plt.matshow(df.corr(), fignum=f.number)
# One tick per numeric column (the original passed the whole shape tuple to
# range(), which raises TypeError).
irange = range(numeric.shape[1])
labels = numeric.columns
# plt.xticks(irange, labels, fontsize=14, rotation=45)
plt.yticks(irange, labels, fontsize=14)
cb = plt.colorbar()
cb.ax.tick_params(labelsize=14)
# Trailing semicolon suppresses the text-object echo in a notebook.
plt.title('Correlation Matrix', fontsize=16);
