Home 00 Labs 01 Nb 2 Html Tests 02 211 Web Scraper 03 Scooter Explorati... 04 Wordclouds 05 Amivi Meetup 08 Apis 09 Looking At Data 10 Satellltes Txt2speach

Don't Look! I'm changing!

URL Copied

Introduction

BinderBinderBinderOpen Source Love svg3

NPM LicenseActiveGitHub last commit

GitHub starsGitHub watchersGitHub forksGitHub followers

We will need these libraries

import requests
import pandas as pd
import mechanicalsoup
from bs4 import BeautifulSoup
# This is the URL endpoint that has our data:
url = 'https://md-dc.211counts.org/dashBoard/barChart'
# And these are the parameters that get submitted to the endpoint as a POST request.
# NOTE(review): 'id' is a nested dict; requests form-encodes nested values poorly —
# confirm whether the endpoint expects a JSON string here instead.
myobj = {
    'identifierCategory': '',
    'sourceType': '',
    'fromMobile': False,
    'id': {"ids": [14160]},
    'timeIntervalId': 333,  # site-internal id for the selected time window
    'centerId': 43,
    'fromDate': 0,
    'toDate': 0,
    'type': 'Z',  # 'Z' = search by zip code
}

Miner

(Works, but html content needs to be scraped separately, through a same-origin request)
# Given the 211 html page, will return a div containing data for each record.
# This needs additional scraping, but now each record's html is isolated.
def scrape(html_data):
    """Parse the raw 211 page html and return the list of per-record
    ``<div class="categories">`` elements (bs4 Tag objects).

    :param html_data: full HTML of the results page as a string.
    :returns: list of matching div Tags (empty list if none found).
    """
    soup = BeautifulSoup(html_data, "html.parser")
    # find_all is the modern name; findAll is a deprecated bs3-era alias.
    categories = soup.find_all("div", {"class": "categories"})
    return categories
# This will mine the scraped html pulled from Puppeteer.
# NOTE(review): htmlText must be assigned (see the placeholder cell) before this runs.
formValues = htmlText
records = []
# Loop through array of html containing our data
for x in scrape(formValues):
    soup = BeautifulSoup(str(x), "html.parser")
    # get the title and value from the record's html
    title = soup.find("span", {"class": "toolTipSubCategory"})
    value = soup.find("span", {"data-value": True})
    # keep the record only if both title and value exist
    if title is not None and value is not None:
        records.append({
            'zipCodeId': str(14174),  # NOTE(review): hard-coded; 14160 is used elsewhere — confirm
            'title': title.text,
            'value': value['data-value'],
        })
# DataFrame.append was removed in pandas 2.0; build the frame from a list of dicts.
df = pd.DataFrame(records, columns=['zipCodeId', 'title', 'value'])
df
df.to_csv('test123.csv', index=False)
# A placeholder until we get the puppeteer scraper working.
htmlText = ''
# Given the 211 html page, will return a div containing data for each record.
# This needs additional scraping, but now each record's html is isolated.
# NOTE(review): duplicate of the scrape() defined above — kept for notebook cell parity.
def scrape(html_data):
    """Parse the raw 211 page html and return all <div class="categories"> Tags."""
    soup = BeautifulSoup(html_data, "html.parser")
    # find_all is the modern name; findAll is a deprecated bs3-era alias.
    return soup.find_all("div", {"class": "categories"})
df
df.to_csv('test123.csv', index=False)

Scraper

(Works, but doesn't give us what we want)

Get the page

# Start a stateful browser session and load the 211 search page.
browser = mechanicalsoup.StatefulBrowser()
browser.open("https://md-dc.211counts.org/")
# Echo the current URL to confirm the page loaded (notebook display).
browser.get_url()

Find the form and select our options

# Find the search form and tick the checkbox for internal zip id 14160.
browser.select_form('div[id="search"]')
browser['chkZ'] = '14160'
# browser.get_current_form().print_summary()
response = browser.submit_selected()

Save the output for previewing

# Save the output for previewing; the with-block guarantees the file is closed
# even if the write raises.
with open("sample.txt", "w") as text_file:
    n = text_file.write(response.text)
# print(response.text)

Unfortunately, this does not work!

Our data is stored as dynamic content that is not delivered to us here.

We will need to use Puppeteer to render the dynamic JavaScript content.

What a shame...

Final Solution (Javascript)

This needs to be run in a separate JS file.

/*
  * https://stackoverflow.com/questions/48681145/set-pupeteer-window-size-when-running-not-headless-not-viewport
 https://medium.com/@aslushnikov/automating-clicks-in-chromium-a50e7f01d3fb
 * 
 */
 const puppeteer = require('puppeteer');
 var zipCodes = { 
   '21201':11175, 
   '21202':11176, 
   '21203':14550, 
   '21204':11177, 
   '21205':11178, 
   '21206':11179, 
   '21207':11180, 
   '21208':11181,
   '21209':11182, 
   '21210':11183, 
   '21211':11184, 
   '21212':11185, 
   '21213':11373, 
   '21214':11375, 
   '21215':11376, 
   '21216':11377,
   '21217':11378, 
   '21218':11379, 
   '21219':11380, 
   '21220':11381, 
   '21221':11382, 
   '21222':11383, 
   '21223':11384, 
   '21224':11385, 
   '21225':11386, 
   '21226':11387, 
   '21227':11388, 
   '21228':11389, 
   '21229':11390, 
   '21230':11391,
   '21231':11392, 
 };
 let nomatch = [ 
   '21232', 
   '14550', 
   '11177', 
   '11178', 
   '11178', 
   '11189'
 ]
 
 var finalData = {};
 // Drives a visible Chromium session through the 211 search UI, runs one
 // search per zip code in zipCodes, and accumulates each result object
 // into finalData keyed by the zip code string.
 // NOTE(review): page.waitFor was removed in newer Puppeteer releases
 // (replaced by page.waitForTimeout) — confirm the pinned puppeteer version.
 (async () => {
   // Init url to scrape, load browser and page
   const browser = await puppeteer.launch({
     headless: false, // The browser is visible
     ignoreHTTPSErrors: true,
     args: [`--window-size=1200,1000`] // new option
   })

   const page = await browser.newPage()
   await page.goto('https://md-dc.211counts.org/')
   await page.setViewport({ width: 1200, height: 1000 })
   await page.waitForSelector('#mainContent #identifierCategory')
   // Click Covid-19 Checkbox
   await page.click('#mainContent #identifierCategory') 
   await page.waitForSelector('#mainContent #displayCount')
   // Click Radio Button
   await page.click('#mainContent #displayCount')
   // Click to Expand the Timestamp Dropdown
   // NOTE(review): these coordinate-based clicks depend on the 1200x1000
   // viewport set above — they break if the page layout changes.
   await page.mouse.move(713, 37);
   await page.mouse.down({button: 'left'});
   await page.mouse.up({button: 'left'});
   // Click from the dropdwn '30 Days'
   await page.mouse.move(688, 194);
   await page.mouse.down({button: 'left'});
   await page.mouse.up({button: 'left'}); 
   // Search for every zip code and add the result to an object
   for (var j = 0; j < Object.keys(zipCodes).length; j++) {
	var currentZipCode = Object.keys(zipCodes)[j]
	var zid = zipCodes[currentZipCode]	 
	console.log("CurZip: " + currentZipCode, ' ID: ', 'input[value="'+zid+'"]');  
	// First pass: runs in the page context ($ is the site's own jQuery);
	// resets the form, then selects the checkbox by internal id and searches.
	await page.evaluate((zc) => {	
	  // Prepare the zip code checkbox identifier to click on
	  let reset = 'input[value="RESET"]';
	  $(reset).click();
	  let zipCodeCheckBox = 'input[value="' + zc  + '"]';
	  $(zipCodeCheckBox).click();
	  // Get search button, click
      $("#submitSearch").click();
	}, zid);
	
	// And wait for the results to load
	await page.waitFor(1000);
	
	await page.screenshot({path: currentZipCode+'.png'});
	
	// Second pass: selects by the zip code string (not the internal id)
	// and re-enables the count display before searching again.
	await page.evaluate((zc) => {
		// Click on count radio button 
		let countsBtn = document.getElementById('displayCount');
		countsBtn.click();
		// Prepare the zip code checkbox identifier to click on
		let zipCodeCheckBox = 'input[value="' + zc  + '"]';
		$(zipCodeCheckBox).click();
		// Get search button, click
		$("#submitSearch").click();
	}, currentZipCode);
	// And wait for the results to load
	await page.waitFor(1000);
	// NOTE(review): this screenshot overwrites the one taken above
	// (same currentZipCode+'.png' path).
	await page.screenshot({path: currentZipCode+'.png'});
		// Then we grab the results
	let result = await page.evaluate(async () => {
      // Get all categories div
	  let categories = document.querySelectorAll(".categories");
	  // Iterate through all categories and add their values to an object
	  let pointer = 0; 
	  let data = {}
	  while (pointer < categories.length) {
		// Now we have access to two categories
		let category = categories[pointer];
		// Making sure we have both a label and a value and category is not null
		if (category != null && category != undefined && category.children.length > 1) {
		  let label = ""; 
		  let value = "";
		  let percentage = "";
		  for (var i = 0; i < category.children.length; i++) {
			let node = category.children[i];
			// Make sure that the node is either a label or a paragraph, can't rely on indices alone
			if (node.nodeName.toLowerCase() === 'label') {
			  label = node.querySelector("span").innerText;
			}
			if (node.nodeName.toLowerCase() === 'p') {
			  value = node.querySelector("span").getAttribute("data-value");
			  percentage = node.querySelector("span").getAttribute("data-percentage");
		    }  
		  }
		  data[label + " Count"] = value;
		  data[label + " Percent"] = percentage;
		}
		pointer += 1;
	  }
	  return data;
	});
	// Now we add this zip code's result to the final answer
	finalData[currentZipCode.toString()] = result;
   }
   console.log(JSON.stringify(finalData));
 
   await page.waitFor(3000); 
   
   
   // await browser.close()
 })()