Home 00 Datalabs 01 Scooter Explorati... 02 211 Web Scraper03 Nbdev Create Clea... 04 Wordclouds 05 Amivi Meetup 06 Courts Mechanical... 07 Nb 2 Html Tests 08 Apis 09 Looking At Data

Don't Look! I'm changing!

URL Copied

Musical loops

BinderBinderBinderOpen Source Love svg3

NPM LicenseActiveGitHub last commit

GitHub starsGitHub watchersGitHub forksGitHub followers

TweetTwitter Follow

We will need these libraries

# Endpoint that serves the bar-chart data.
url = 'https://md-dc.211counts.org/dashBoard/barChart'

# And these are the parameters that get submitted to the endpoint as a POST request
myobj = {
    'identifierCategory': '',
    'sourceType': '',
    'fromMobile': False,
    'id': {"ids": [14160]},
    'timeIntervalId': 333,
    'centerId': 43,
    'fromDate': 0,
    'toDate': 0,
    'type': 'Z',
}

Miner

(Works, but the HTML content needs to be scraped separately, through a same-origin request)

def scrape(html_data):
    """Return every ``<div class="categories">`` element found in ``html_data``.

    The result still needs further scraping, but each record's HTML is now
    isolated in its own element.
    """
    soup = BeautifulSoup(html_data, "html.parser")
    return soup.find_all("div", {"class": "categories"})


# NOTE(review): htmlText is expected to already hold the page HTML at this
# point; where it is populated is not visible in this snippet.
formValues = htmlText

# Collect one dict per record and build the DataFrame once at the end;
# DataFrame.append was removed in pandas 2.0.
rows = []
for category in scrape(formValues):
    # Get the title and value from the record's HTML.
    title = category.find("span", {"class": "toolTipSubCategory"})
    value = category.find("span", {"data-value": True})
    # Keep the record only if both the title and the value exist.
    if title is not None and value is not None:
        rows.append({
            'zipCodeId': str(14174),
            'title': title.text,
            'value': value['data-value'],
        })

df = pd.DataFrame(rows, columns=['zipCodeId', 'title', 'value'])

# Reset the HTML holder, as the original snippet did after processing.
htmlText = ''

Scraper

(Works, but doesn't give us what we want)

Get the page

# Load the landing page, then read back the URL the browser ended up on.
browser.open("https://md-dc.211counts.org/")
browser.get_url()

Find the form and select our options

# NOTE(review): assumes a form has already been selected on `browser` in a
# step that is not shown here -- confirm before running this on its own.
# Set the 'chkZ' field to the id we want, then submit the form.
browser['chkZ'] = '14160'
response = browser.submit_selected()

Save the output for previewing

# Write the response body to the already-open text_file, closing the handle
# even if the write fails.
try:
    n = text_file.write(response.text)
finally:
    text_file.close()
# print(response.text)

Unfortunately, this does not work!

Our data is stored as dynamic content that is not delivered to us here.

We will need to use Puppeteer to render the dynamic JavaScript content.

What a shame...

Final Solution (Javascript)

This needs to be run in a separate JS file.

/*
 * Opens https://md-dc.211counts.org/ in a visible Chromium window, selects the
 * dashboard options, then searches each zip code in `zipCodes` in turn and
 * prints the collected per-category counts and percentages as JSON.
 *
 * References:
 *   https://stackoverflow.com/questions/48681145/set-pupeteer-window-size-when-running-not-headless-not-viewport
 *   https://medium.com/@aslushnikov/automating-clicks-in-chromium-a50e7f01d3fb
 */
const puppeteer = require('puppeteer');

// Zip code -> id used as the `value` attribute of that zip code's checkbox.
const zipCodes = {
  '21201': 11175,
  '21202': 11176,
  '21203': 14550,
  '21204': 11177,
  '21205': 11178,
  '21206': 11179,
  '21207': 11180,
  '21208': 11181,
  '21209': 11182,
  '21210': 11183,
  '21211': 11184,
  '21212': 11185,
  '21213': 11373,
  '21214': 11375,
  '21215': 11376,
  '21216': 11377,
  '21217': 11378,
  '21218': 11379,
  '21219': 11380,
  '21220': 11381,
  '21221': 11382,
  '21222': 11383,
  '21223': 11384,
  '21224': 11385,
  '21225': 11386,
  '21226': 11387,
  '21227': 11388,
  '21228': 11389,
  '21229': 11390,
  '21230': 11391,
  '21231': 11392,
};

// NOTE(review): not referenced anywhere in this script; kept unchanged.
const nomatch = [
  '21232', '14550', '11177', '11178', '11178', '11189',
];

// Zip code -> { "<label> Count": ..., "<label> Percent": ... }.
const finalData = {};

// How long to wait after each search before reading the page.
const SETTLE_DELAY_MS = 1000;

/**
 * Resolve after `ms` milliseconds.
 * Stands in for `page.waitFor(ms)`, which newer Puppeteer releases no longer provide.
 * @param {number} ms
 * @returns {Promise<void>}
 */
const delay = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

/**
 * Left-click at viewport coordinates by moving the mouse there, then pressing
 * and releasing the left button.
 * @param {object} page - Puppeteer page.
 * @param {number} x
 * @param {number} y
 */
async function clickAt(page, x, y) {
  await page.mouse.move(x, y);
  await page.mouse.down({ button: 'left' });
  await page.mouse.up({ button: 'left' });
}

/**
 * Runs INSIDE the page (handed to `page.evaluate`), so it must not reference
 * anything from the Node scope.
 *
 * Reads every `.categories` element that has more than one child and records
 * the `data-value` / `data-percentage` attributes of its <p> span under keys
 * built from the text of its <label> span.
 * @returns {Object<string, string>}
 */
function collectCategoryData() {
  const data = {};
  for (const category of document.querySelectorAll('.categories')) {
    // A usable record needs at least a label and a value node.
    if (category.children.length <= 1) {
      continue;
    }
    let label = '';
    let value = '';
    let percentage = '';
    for (const node of category.children) {
      // Identify nodes by tag name; positions alone are not reliable.
      const tag = node.nodeName.toLowerCase();
      if (tag !== 'label' && tag !== 'p') {
        continue;
      }
      const span = node.querySelector('span');
      if (span === null) {
        // Nothing to read from this node; skip it rather than throwing.
        continue;
      }
      if (tag === 'label') {
        label = span.innerText;
      } else {
        value = span.getAttribute('data-value');
        percentage = span.getAttribute('data-percentage');
      }
    }
    data[label + ' Count'] = value;
    data[label + ' Percent'] = percentage;
  }
  return data;
}

(async () => {
  // Init url to scrape, load browser and page.
  const browser = await puppeteer.launch({
    headless: false, // The browser is visible
    ignoreHTTPSErrors: true,
    args: [`--window-size=1200,1000`],
  });
  const page = await browser.newPage();
  await page.goto('https://md-dc.211counts.org/');
  await page.setViewport({ width: 1200, height: 1000 });

  // Click Covid-19 Checkbox
  await page.waitForSelector('#mainContent #identifierCategory');
  await page.click('#mainContent #identifierCategory');

  // Click Radio Button
  await page.waitForSelector('#mainContent #displayCount');
  await page.click('#mainContent #displayCount');

  // Click to expand the Timestamp dropdown, then pick '30 Days' from it.
  // These are fixed coordinates and depend on the 1200x1000 viewport set above.
  await clickAt(page, 713, 37);
  await clickAt(page, 688, 194);

  // Search for every zip code and add the result to an object.
  for (const [currentZipCode, zid] of Object.entries(zipCodes)) {
    console.log(`CurZip: ${currentZipCode}`, ' ID: ', `input[value="${zid}"]`);

    await page.evaluate((zc) => {
      // `$` is whatever the page itself exposes under that name.
      // Clear the previous selection, tick this zip code's checkbox, search.
      $('input[value="RESET"]').click();
      $('input[value="' + zc + '"]').click();
      $('#submitSearch').click();
    }, zid);

    // And wait for the results to load.
    await delay(SETTLE_DELAY_MS);
    await page.screenshot({ path: currentZipCode + '.png' });

    // NOTE(review): this second pass is given the zip code itself, not the
    // checkbox id used above. Kept exactly as in the original flow; confirm
    // whether the checkbox click here is meant to match anything.
    await page.evaluate((zc) => {
      // Click on count radio button
      document.getElementById('displayCount').click();
      $('input[value="' + zc + '"]').click();
      $('#submitSearch').click();
    }, currentZipCode);

    // Wait again; this screenshot overwrites the one taken above (same path).
    await delay(SETTLE_DELAY_MS);
    await page.screenshot({ path: currentZipCode + '.png' });

    // Then we grab the results and add them to the final answer.
    finalData[currentZipCode] = await page.evaluate(collectCategoryData);
  }

  console.log(JSON.stringify(finalData));
  await delay(3000);
  // The close call was disabled in the original, so the browser window stays
  // open and the process keeps running until it is closed by hand.
  // await browser.close()
})().catch((err) => {
  // Surface failures instead of leaving an unhandled promise rejection.
  console.error(err);
  process.exitCode = 1;
});