# Notebook setup cell. NOTE(review): '%%capture' and the '!' lines are
# IPython/Colab magics, not plain Python — this cell only runs inside a
# Jupyter kernel. It installs nbdev and upgrades the notebook metadata.
#hide %%capture !pip install nbdev from nbdev import * ! nbdev_upgrade
Introduction
This chapter has been brought to you in collaboration by Brian Kelly, Michael Vandi, Logan Shertz, and Charles Karpati.
Dataset: Scooter data:
- Routes: 5 months (August to December 2019)
- Deployment/
- Routes
- Trip origins-destinations by month
Local File Access (Optional)
# Colab-only cell: mounts Google Drive at /content/drive so the data files
# can be read. NOTE(review): the trailing "File path's will vary" is a
# markdown heading fused onto this cell by the notebook export.
# (Optional) Run this cell to gain access to Google Drive (Colabs only) from google.colab import drive # Colabs operates in a virtualized enviornment # Colabs default directory is at ~/content. # We mount Drive into a temporary folder at '~/content/drive' drive.mount('/content/drive')File path's will vary
# Shell magics: change into the project data folder on Drive and list its
# contents. NOTE(review): "Installs" at the end is a fused markdown heading.
# cd ./drive/'My Drive'/BNIA/'Scooter Use Data'/BNIA ! ls !cd ../ && lsInstalls
Imports
Convenience Functions
def getPointsInPolygons(pts, polygons, ptsCoordCol, polygonsCoordCol):
    """Count how many points fall inside each polygon.

    Parameters
    ----------
    pts : GeoDataFrame-like with one point geometry per row in `ptsCoordCol`.
    polygons : GeoDataFrame-like with one polygon per row in `polygonsCoordCol`.
    ptsCoordCol, polygonsCoordCol : names of the geometry columns.

    Returns
    -------
    `polygons` with a new 'pointsinpolygon' column holding the number of
    points contained in each polygon (in the polygons' row order).

    Notes
    -----
    Once a point matches a polygon it is dropped from the search set, so
    later polygons do not re-test it (each point is counted at most once).
    """
    count = 0
    total = len(pts)  # row count (the original pts.size / n_cols float, as an int)
    pts_in_polys = []  # per-polygon counts, positional

    for i, poly in polygons.iterrows():
        pts_in_this_poly = []
        for j, pt in pts.iterrows():
            if poly[polygonsCoordCol].contains(pt[ptsCoordCol]):
                # Hit: record it and drop it so we have less hunting later.
                pts_in_this_poly.append(pt[ptsCoordCol])
                pts = pts.drop([j])
        pts_in_polys.append(len(pts_in_this_poly))
        print('Found this many points within the Geom:', len(pts_in_this_poly))
        count = count + len(pts_in_this_poly)
        clear_output(wait=True)

    # BUGFIX: assign the plain list positionally. The original wrapped the
    # counts in gpd.GeoSeries(...), whose default RangeIndex aligns by label
    # and would misplace/NaN the counts whenever `polygons` does not have a
    # clean 0..n-1 index.
    polygons['pointsinpolygon'] = pts_in_polys

    print('Total Points: ', total)
    print('Total Points in Polygons: ', count)
    # Guard the percentage against an empty pts frame.
    print('Prcnt Points in Polygons: ', count / total if total else float('nan'))
    return polygons
def findFile(root, file):
    """Walk `root` recursively and return 'directory/filename' for the first
    occurrence of `file`; return None when it is not found anywhere."""
    for d, subD, f in os.walk(root):
        if file in f:
            # (the original had an unreachable `break` after this return)
            return "{0}/{1}".format(d, file)
    return None


# To 'import' a script you wrote, map its filepath into the sys path.
def addPath(root, file):
    """Locate `file` under `root` and append its absolute path to sys.path.

    NOTE(review): this appends the file's own path, not its directory —
    importing the script may actually require dirname(); confirm with callers.
    """
    # BUGFIX: the original ignored the `root` argument and always searched './'.
    sys.path.append(os.path.abspath(findFile(root, file)))
# Colab form cell: picks a deployment/origin/destination file, builds point
# centroids, plots them, and inspects basic DataFrame attributes.
# NOTE(review): this cell is damaged by the export —
#   * `polygonsCoordCol = 'geometry', polygonsLabel = 'CSA2010')` is an
#     orphaned argument-list fragment (the call it belonged to was lost),
#   * `pointsInPolys` and `df` are referenced but never defined here,
# so it will not run as-is; left byte-identical pending reconstruction.
#@markdown Forms support many types of fields. fileName = "Trip Origins by block September 2019.geojson" #@param ['Daily Deployment average by block August 2019.csv', 'Daily Deployment average by block December 2019.csv', 'Daily Deployment average by block November 2019.csv', 'Daily Deployment average by block October 2019.csv', 'Daily Deployment average by block September 2019.csv', 'Trip Destinations by block August 2019.geojson','Trip Destinations by block December 2019.geojson','Trip Destinations by block November 2019.geojson', 'Trip Destinations by block October 2019.geojson', 'Trip Destinations by block September 2019.geojson', 'Trip Origins by block August 2019.geojson','Trip Origins by block December 2019.geojson','Trip Origins by block November 2019.geojson','Trip Origins by block October 2019.geojson','Trip Origins by block September 2019.geojson'] #@markdown --- gdf.head() point_df['centroids'] = df.centroid point_df = point_df.drop(columns = 'geometry') point_df = point_df.set_geometry('centroids') point_df.head(1) point_df.plot(marker='o', color='red', markersize='value') polygonsCoordCol = 'geometry', polygonsLabel = 'CSA2010') pointsInPolys.plot(column='value', legend=True, markersize = 1) gdf.shape #DataFrame.size Return an int representing the number of elements in this object. gdf.size # DataFrame.ndim Return an integer representing the number of axes/array dimensions. gdf.ndim # Note Used : # DataFrame.axes Return a list representing the axes of the DataFrame. gdf.dtypes # Return unbiased kurtosis over requested axis using Fisherβs definition of kurtosis (kurtosis of normal == 0.0). gdf.kurtosis()Load Tracts
# Display/formatting preferences for interactive inspection.
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.precision', 2)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
pd.set_option('max_colwidth', 20)

in_crs = 2248   # The CRS we receive our data in.
out_crs = 4326  # The CRS we would like to have our data represented as.
geom = 'geometry'  # The column where our spatial information lives.

# BUGFIX: in the notebook export this assignment was swallowed into the
# preceding comment, leaving the name undefined before pd.read_csv used it.
# A URL to load the tract boundaries from.
boundariesBaltimoreTractsNoWater2010 = "https://docs.google.com/spreadsheets/d/e/2PACX-1vQ8xXdUaT17jkdK0MWTJpg3GOy6jMWeaXTlguXNjCSb8Vr_FanSZQRaTU-m811fQz4kyMFK5wcahMNY/pub?gid=886223646&single=true&output=csv"

# Read in the dataframe.
gdf = pd.read_csv(boundariesBaltimoreTractsNoWater2010)
# Convert the geometry column from a string of text into a coordinate datatype.
# NOTE(review): `loads` is presumably shapely.wkt.loads imported elsewhere — confirm.
gdf['geometry'] = gdf['geometry'].apply(lambda x: loads(str(x)))
# Process the dataframe as a geodataframe with a known CRS and geom column.
gdf = GeoDataFrame(gdf, crs=in_crs, geometry='geometry')
# Coerce the merge key to string element-wise so both sides of the
# upcoming merge share a consistent dtype.
scooterdf['nameChange2'] = scooterdf['nameChange2'].apply(str)
# gdf.drop(columns='geometry').to_csv('./boundsdf.csv', index=False)Inspect Routes Data
# Loads the selected routes GeoJSON and overlays it on a city choropleth
# with Altair. NOTE(review): this cell is damaged by the export — the span
# from `stroke='white'` through `height=500 )"""` is the tail of a
# triple-quoted string whose opening was lost, and `baltimore`/`alt` are
# defined elsewhere in the notebook; left byte-identical pending repair.
columnName = "streetname" #@param ['id', 'color', 'streetname', 'trip_count_sum', 'trip_count_average', 'trip_count_percent'] gdf = gpd.read_file( findFile('./', fileName) ) gdf.head() stroke='white', strokeWidth=2 ).encode( color=alt.value('#eee'), ).properties( width=700, height=500 )""" # GeoDataFrame could be passed as usual pd.DataFrame city = alt.Chart(baltimore).mark_geoshape( ).project( ).encode( color='tpop10', # shorthand infer types as for regular pd.DataFrame tooltip='CSA2010' # GeoDataFrame.index is accessible as id ).properties( width=500, height=300 ) routes = alt.Chart(gdf).mark_geoshape( filled=False, strokeWidth=2 ) city + routesClean the gdf of empties.
# Drop rows whose geometry is empty before computing the route extremities.
gdf = gdf.loc[~gdf.is_empty]
# "Now get the extremities; this will take a minute." (markdown heading fused
# onto this cell by the notebook export)
# Unpack the two endpoint coordinates of each route geometry into four
# columns. NOTE(review): `split` is a project helper not visible here —
# presumably it returns (lefty, leftx, righty, rightx) per geometry; confirm.
# The trailing "Split the gdf..." text is a fused markdown heading.
gdf['lefty'], gdf['leftx'],gdf['righty'], gdf['rightx'] = zip(*gdf["geometry"].map(split))Split the gdf into a left and right dataset
# BUGFIX: the notebook export lost the line that created gdf_left; without
# it the first drop() below raises NameError. Both sides start as copies.
gdf_left = gdf.copy()
gdf_left = gdf_left.drop(columns=['geometry', 'rightx', 'righty'])

gdf_right = gdf.copy()
# The right-hand frame only needs its coordinates; shed everything else.
gdf_right = gdf_right.drop(columns=['geometry', 'leftx', 'lefty', 'streetname',
                                    'trip_count_sum', 'trip_count_average',
                                    'trip_count_percent', 'color'])
Let's go ahead and coerce a correction.
# The extremity columns come back as object dtype, which breaks the
# geocoding steps below — coerce them to numeric (bad values become NaN).
gdf_left['lefty'] = pd.to_numeric(gdf_left['lefty'], errors='coerce')
gdf_left['leftx'] = pd.to_numeric(gdf_left['leftx'], errors='coerce')
gdf_right['rightx'] = pd.to_numeric(gdf_right['rightx'], errors='coerce')
# BUGFIX: the original coerced only rightx on the right-hand frame; righty
# was left as object dtype and would hit the exact problem described above.
gdf_right['righty'] = pd.to_numeric(gdf_right['righty'], errors='coerce')
Save these CSVs because it took a minute to get to where we are now.
# Checkpoint the expensive extremity computation to disk.
# NOTE(review): only the left frame is saved although the heading says
# "CSVs" (plural) — gdf_right is never written; also 'leftRouts.csv' looks
# like a typo for 'leftRoutes.csv'. Confirm before changing either.
gdf_left.to_csv('leftRouts.csv')Convert the datasets to geodataframes for further exploration!
# Final cell: wraps the left/right endpoint frames as point geodata via the
# project helper readInGeometryData, counts right-side endpoints per CSA
# polygon with getPointsInPolygons, and renders/saves an Altair choropleth
# overlaid with the route lines.
# NOTE(review): `import geopandas as gpd` / `import gpdvega` appear mid-cell
# rather than at the top of the file; `left_csaMap` is computed but appears
# unused afterwards — presumably a left-side counterpart was intended; and
# `readInGeometryData`/`baltimore` are defined elsewhere in the notebook.
# fileName[:-8] strips the '.geojson' suffix for the output names.
#temp_gdf = gpd.GeoDataFrame( gdf_right, geometry=gpd.points_from_xy(gdf_right.rightx, gdf_right.righty) ) #temp_gdf.head() # Alternately this could work.. unfinished.. but wkt.loads can make a Point from text # gdf_right['strCol']=gdf_right['rightx'].astype(str) # gdf_right['geometry'] = gdf_right['strCol'].apply(wkt.loads) left_csaMap = readInGeometryData(url=gdf_left, porg='p', geom= False, lat= 'leftx', lng= 'lefty', revgeocode=False, save=False, in_crs=4268, out_crs=4268) right_csaMap = readInGeometryData(url=gdf_right, porg='p', geom= False, lat= 'rightx', lng= 'righty', revgeocode=False, save=False, in_crs=4268, out_crs=4268) right_points_in_poly = getPointsInPolygons(right_csaMap, baltimore, 'geometry', 'geometry') import geopandas as gpd import gpdvega # GeoDataFrame could be passed as usual pd.DataFrame chart = alt.Chart(right_points_in_poly).mark_geoshape( ).project( ).encode( color='pointsinpolygon', # shorthand infer types as for regular pd.DataFrame tooltip=['CSA2010','pointsinpolygon'] # GeoDataFrame.index is accessible as id ).properties( width=500, height=300 ) routes = alt.Chart(gdf).mark_geoshape( filled=False, strokeWidth=2 ) chart + routes chart.save(fileName[:-8]+'.html', embed_options={'renderer':'svg'}) chart.save(fileName[:-8]+'.json')