Home 00 Datalabs 01 Scooter Explorati... 02 211 Web Scraper 03 Nbdev Create Clea... 04 Wordclouds 05 Amivi Meetup 06 Courts Mechanical... 07 Nb 2 Html Tests 08 Apis 09 Looking At Data

Don't Look! I'm changing!

URL Copied

Musical loops

Today we will run through a few experiments in working with data.

We will be using a library created by BNIA, among others.

import pandas as pd import geopandas as gpd import matplotlib.pyplot as plt import networkx as nx import warnings warnings.filterwarnings('ignore')

Let's start from where we left off last time.

# Download one indicator layer from the ArcGIS FeatureServer as GeoJSON,
# index it by CSA, and export the attribute table (no geometry) to CSV.
import csv  # needed for csv.QUOTE_ALL below

baseurl = "https://services1.arcgis.com/mVFRs7NF4iFitgbY/ArcGIS/rest/services/"
slug = "/FeatureServer/0/query?where=1%3D1&outFields=*&returnGeometry=true&f=pgeojson"

# NOTE(review): `shortname` is not assigned in this cell; it must already hold
# the service name from an earlier cell. The libcard11..libcard19 columns in
# the output suggest the library-card indicator -- confirm.
url = baseurl+shortname+slug

# `columns=` already selects the axis, so no separate `axis` argument is needed.
gdf = (
    gpd.read_file(url)
    .set_index('CSA2010')
    .drop(columns=['OBJECTID', 'Shape__Area', 'Shape__Length'])
)

# Quote every field so CSA names containing commas or slashes stay intact.
gdf.drop(columns=['geometry']).to_csv(shortname+'.csv', quoting=csv.QUOTE_ALL)
libcard11libcard12libcard13libcard14libcard15libcard16libcard17libcard18libcard19
CSA2010
Allendale/Irvington/S. Hilton194.672258206.326694185.546032318.616267328.975766276.006660229.882222233.397052214.527964
Beechfield/Ten Hills/West Hills153.212655153.131115140.410959249.510763261.986301225.782779178.082192175.391389167.482061
Belair-Edison319.418925310.289389261.311438443.959577463.711530401.067983337.792834343.706936333.486449
Brooklyn/Curtis Bay/Hawkins Point229.726883195.464439187.109457307.589693352.383627296.145475272.344309269.255073254.370568
Canton267.777778235.308642169.382716284.320988299.753086284.938272269.259259288.518519309.259259
# Preview the first row of `test` (assigned outside this excerpt).
test.head(1)
CSA2010indexAllendale/Irvington/S. HiltonBeechfield/Ten Hills/West HillsBelair-EdisonBrooklyn/Curtis Bay/Hawkins PointCantonCedonia/FrankfordCherry HillChinquapin Park/BelvedereClaremont/ArmisteadClifton-BereaCross-Country/CheswoldeDickeyville/FranklintownDorchester/AshburtonDowntown/Seton HillEdmondson VillageFells PointForest Park/WalbrookGlen-FallstaffGreater Charles Village/BarclayGreater GovansGreater MondawminGreater Roland Park/Poplar HillGreater RosemontGreenmount EastHamiltonHarbor East/Little ItalyHarford/EchodaleHighlandtownHoward Park/West ArlingtonInner Harbor/Federal HillLauravilleLoch RavenMadison/East EndMedfield/Hampden/Woodberry/RemingtonMidtownMidway/ColdstreamMorrell Park/VioletvilleMount Washington/ColdspringNorth Baltimore/Guilford/HomelandNorthwoodOldtown/Middle EastOrangeville/East HighlandtownPatterson Park North & EastPenn North/Reservoir HillPimlico/Arlington/HilltopPoppleton/The Terraces/Hollins MarketSandtown-Winchester/Harlem ParkSouth BaltimoreSoutheasternSouthern Park HeightsSouthwest BaltimoreThe WaverliesUpton/Druid HeightsWashington Village/PigtownWestport/Mount Winans/Lakeland
0libcard11194.672258153.212655319.418925229.726883267.777778216.283907323.579615236.719959182.723849279.52197778.256867114.362351206.261666381.01148293.037975249.253236279.114631125.050288244.402416289.954124239.219052486.512132256.295758284.946237224.427011319.585722188.550389362.068966141.175389314.352392266.601483212.788191330.805809328.329883386.08522275.24937767.157519268.962848243.987632272.066334313.641353281.458767372.671661299.338022163.083954277.624853276.517186246.175461173.801917183.604336205.870841358.957823293.656933318.916954155.499368

https://seaborn.pydata.org/examples/horizontal_boxplot.html

import matplotlib.pyplot as plt sns.set_theme(style="ticks") # Initialize the figure with a logarithmic x axis f, ax = plt.subplots(figsize=(7, 6)) ax.set_xscale("log") # Load the example planets dataset planets = sns.load_dataset("planets") # Plot the orbital period with horizontal boxes sns.boxplot(x="value", y="index", data=test, whis=[0, 100], width=.6, palette="vlag") # Add in points to show each observation sns.stripplot(x="value", y="index", data=test, size=4, color=".3", linewidth=0) # Tweak the visual presentation ax.xaxis.grid(True) ax.set(ylabel="") sns.despine(trim=True, left=True) Image Alt Text sns.set_theme(style="ticks") sns.pairplot(df.reset_index(), hue="CSA2010")Image Alt Text import numpy as np from pandas import DataFrame import seaborn as sns %matplotlib inline # We can change the size of our images like this: plt.figure(figsize=(10,10)) # And heatmaps are as simple as this: sorted_df = df.sort_values(by=['libcard19'], ascending = False) sns.heatmap(sorted_df)Image Alt TextImage Alt Text import geopandas as gpd import numpy as np import pandas as pd from branca.colormap import linear from dataplay import intaker # conditionally loaded -> from dataplay import geoms u = intaker.Intake rdf = u.getData('https://services1.arcgis.com/mVFRs7NF4iFitgbY/ArcGIS/rest/services/Biz1_/FeatureServer/0/query?where=1%3D1&outFields=*&returnGeometry=true&f=pgeojson') # rdf.set_index('CSA2010', drop=True, inplace=True) rdf.drop(labels=['OBJECTID_1', 'Shape__Area', 'Shape__Length'], axis=1, inplace=True) ndf = rdf.filter(regex='biz1|CSA2010', axis=1) # Calculate number of years available n_periods = len(ndf.columns) - 1 # Get starting year. 
startAt = "20"+ndf.columns[1][-2:] # Create a 'YEAR' index with the assumption that all following years exist datetime_index = pd.date_range(startAt, periods=n_periods, freq="Y") dt_index_epochs = datetime_index.astype(int) // 10 ** 9 dt_index = dt_index_epochs.astype("U10") styledata = {} # For the Index of each CSA for idx, csa in rdf.iterrows(): df = pd.DataFrame( { "color": csa.values[1:-1] }, index=dt_index, ) styledata[idx] = df max_color, min_color = 0, 0 for country, data in styledata.items(): max_color = max(max_color, data["color"].max()) min_color = min(max_color, data["color"].min()) cmap = linear.PuRd_09.scale(min_color, max_color) def norm(x): return (x - x.min()) / (x.max() - x.min()) for country, data in styledata.items(): data["color"] = data["color"].apply(cmap) data["opacity"] = 1 styledict = { str(country): data.to_dict(orient="index") for country, data in styledata.items() } # { CSA : { timestamp: {color: value, opacity:value } }, # CSA : { timestamp: {color: value, opacity:value } }, # ... # } import folium from folium.plugins import TimeSliderChoropleth m = folium.Map([39.28759453969165, -76.61278931706487], width='75%', height='75%', zoom_start=12) g = TimeSliderChoropleth( rdf.to_json(), styledict=styledict, ).add_to(m) m.save(outfile= "test.html") m Output hidden; open in https://colab.research.google.com to view. rdf = u.getData('https://services1.arcgis.com/mVFRs7NF4iFitgbY/ArcGIS/rest/services/Biz1_/FeatureServer/0/query?where=1%3D1&outFields=*&returnGeometry=true&f=pgeojson') # rdf.set_index('CSA2010', drop=True, inplace=True) rdf.head() rdf.drop(labels=['OBJECTID_1', 'Shape__Area', 'Shape__Length'], axis=1, inplace=True) rdf.sort_values(by=['biz1_19'], ascending = False, inplace=True) vs10to19Ind = rdf.filter(regex='biz1|CSA2010', axis=1) gothere https://services1.arcgis.com/mVFRs7NF4iFitgbY/ArcGIS/rest/services/Biz1_/FeatureServer/0/query?where=1%3D1&outFields=*&returnGeometry=true&f=pgeojson gothere1

What we want is 1 record for every year and every CSA as a column. To do this, transpose the dataset. Set the CSA labels (first row) as our columns, relabel the index (for clarity) and cast our datatypes.

What we want is 1 record for every year and every CSA as a column. To do this, transpose the dataset. Set the CSA labels (first row) as our columns, relabel the index (for clarity) and cast our datatypes.

# Reshape the indicator table to one row per year and one column per CSA,
# then compute the CSA-by-CSA correlation matrix.

# Transpose from the untransposed frame each time, so re-running this cell
# does not consume another row as the header.
vs10to19Indt = vs10to19Ind.T

# The first transposed row holds the CSA labels: promote it to the header.
vs10to19Indt.columns = vs10to19Indt.iloc[0]
vs10to19Indt = vs10to19Indt[1:]
vs10to19Indt.index.name = 'variable'

# Transposing mixed columns leaves object dtype; cast so corr() can run.
vs10to19Indt = vs10to19Indt.astype('float64')

cor_matrix = vs10to19Indt.corr()

# shows the first 5 rows
cor_matrix.head(5)
CSA2010Oldtown/Middle EastLoch RavenMount Washington/ColdspringGreater Charles Village/BarclayDorchester/AshburtonLauravilleOrangeville/East HighlandtownCherry HillGreater GovansEdmondson VillageChinquapin Park/BelvedereBelair-EdisonPimlico/Arlington/HilltopDowntown/Seton HillGreater Roland Park/Poplar HillNorth Baltimore/Guilford/HomelandInner Harbor/Federal HillGreater RosemontMorrell Park/VioletvilleMedfield/Hampden/Woodberry/RemingtonPatterson Park North & EastCedonia/FrankfordHarford/EchodaleHarbor East/Little ItalyMidtownHoward Park/West ArlingtonNorthwoodSouth BaltimoreWashington Village/PigtownSouthern Park HeightsBeechfield/Ten Hills/West HillsHamiltonSouthwest BaltimoreCross-Country/CheswoldeAllendale/Irvington/S. HiltonGreater MondawminGlen-FallstaffFells PointSoutheasternPoppleton/The Terraces/Hollins MarketCantonBrooklyn/Curtis Bay/Hawkins PointWestport/Mount Winans/LakelandForest Park/WalbrookPenn North/Reservoir HillSandtown-Winchester/Harlem ParkHighlandtownClaremont/ArmisteadDickeyville/FranklintownUpton/Druid HeightsClifton-BereaMadison/East EndThe WaverliesMidway/ColdstreamGreenmount East
CSA2010
Oldtown/Middle East1.0000000.8561430.7739100.088855-0.608827-0.4995270.8146920.4038810.2511160.4949360.199884-0.1682260.6783660.6903500.5280170.295356-0.064447-0.6290570.3043080.618626-0.388397-0.652071-0.082405-0.421950-0.185125-0.061869-0.172368-0.198497-0.440238-0.589253-0.500684-0.208732-0.5979280.168858-0.541845-0.578843-0.538057-0.573336-0.146943-0.512577-0.213811-0.252186-0.653031-0.525645-0.665015-0.481733-0.309692-0.5508630.265893-0.500873-0.681230-0.801715-0.578070-0.494797-0.267336
Loch Raven0.8561431.0000000.7638490.099961-0.418352-0.2603450.6119990.6272700.3238620.3858950.043460-0.0617500.3985260.6163270.3410740.376594-0.060724-0.4021430.3168090.521855-0.359277-0.502731-0.187982-0.524136-0.3961050.201605-0.268888-0.413832-0.504003-0.629982-0.130710-0.343178-0.5788360.290089-0.381243-0.510632-0.621275-0.610172-0.193038-0.659241-0.385994-0.232488-0.373138-0.578257-0.448557-0.573299-0.495425-0.710760-0.021265-0.492105-0.522877-0.790195-0.625502-0.642093-0.544531
Mount Washington/Coldspring0.7739100.7638491.0000000.230172-0.298009-0.0618550.7201580.448142-0.1472210.3455490.1913830.0641900.5874200.6411320.5049730.4537590.054264-0.5432990.2925590.729633-0.342624-0.448081-0.115320-0.337317-0.117033-0.119977-0.045260-0.460318-0.240693-0.591640-0.451999-0.228413-0.4714390.196364-0.302116-0.362125-0.155026-0.556791-0.029925-0.600164-0.235837-0.469413-0.424707-0.567754-0.711652-0.537917-0.194337-0.3728150.149830-0.581333-0.889449-0.505319-0.424233-0.428873-0.260162
Greater Charles Village/Barclay0.0888550.0999610.2301721.0000000.6583060.6655910.1885990.1746700.3292200.4386930.6632980.812766-0.2559500.5765180.5704190.3574860.7983950.349247-0.0421380.0962130.7267420.6150490.4401040.7180920.656386-0.3387720.4437200.5483000.5442890.5383340.3214350.5974350.5089490.2467470.5896880.6309920.2611270.495503-0.0712470.1893000.6440590.2174430.1638020.2549080.2014300.5935430.5291420.4674510.0339320.358678-0.1721680.1063810.1841520.2671920.520408
Dorchester/Ashburton-0.608827-0.418352-0.2980090.6583061.0000000.868057-0.449121-0.1802660.071918-0.0670830.2936270.578843-0.663439-0.1727990.0954960.0480370.5952470.827268-0.364005-0.3629250.8059200.8881070.1604640.7135370.622471-0.1765380.3131090.4758310.8145340.7350800.5500780.5393970.612604-0.0234560.8356690.8277550.5420150.713982-0.0617360.4615620.4700960.2635710.7337620.4447070.5701760.7271400.5479600.568392-0.2084970.5159620.3590930.5924830.3506400.5045600.442400
# Draw the correlation matrix of df's numeric columns as a colour grid.
import matplotlib.pyplot as plt

# Use one numeric-only view for both the matrix and the tick labels, so they
# always line up. pandas >= 2.0 no longer drops non-numeric columns in corr().
numeric_df = df.select_dtypes(['number'])

f = plt.figure(figsize=(19, 15))
plt.matshow(numeric_df.corr(), fignum=f.number)

irange = range(numeric_df.shape[1])
labels = numeric_df.columns
# plt.xticks(irange, labels, fontsize=14, rotation=45)
plt.yticks(irange, labels, fontsize=14)

cb = plt.colorbar()
cb.ax.tick_params(labelsize=14)
plt.title('Correlation Matrix', fontsize=16);