⚠️ For best experience, open in Colab! ⚠️
Selenium vs Puppeteer vs jsdom vs casper.ipynb
CasperJS is missing from this notebook, but it is very easy to use.
jsdom
!npm install jsdom
%%writefile testfile.js
const jsdom = require("jsdom");
const { JSDOM } = jsdom;

JSDOM.fromURL('https://bniajfi.org', { resources: "usable" }).then(dom => {
  // Serialize the rendered document and keep only what comes after the first
  // occurrence of the document.getElementById("container") call.
  let txt = dom.serialize();
  txt = txt.split('document.getElementById("container")')[1];
  console.log(txt);
});
%%writefile bnia_trends.js
const jsdom = require("jsdom");
const { JSDOM } = jsdom;

// The original cell mapped over an undefined `links` array; define it here so
// the script runs. Add more URLs as needed.
const links = ['https://bniajfi.org'];

(async () => {
  links.map(link => {
    console.log(link);
    JSDOM.fromURL(link, { resources: "usable" }).then(dom => {
      // Serialize the rendered document and keep everything before the first
      // occurrence of the document.getElementById("container") call.
      let txt = dom.serialize();
      txt = txt.split('document.getElementById("container")')[0];
      console.log(txt);
    });
  });
})();
Puppeteer
Headless Recorder Extension: a Chrome extension that records your clicks and navigation in the browser and generates a Puppeteer (or Playwright) script from the session, which is handy for bootstrapping scripts like the one below.
Now let's write the JS script and stuff it into a file (bniatest.js below).
# !npm install papaparse
!npm i -s csv-parser
!npm i -s csv-writer
! npm i async-csv
!echo "" >data.csv
! npm install html-table-to-json
! pip install requests_html
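As an aside, requests_html is installed above but never used in the rest of the notebook. A minimal sketch of what fetching the same page with it could look like (the '#container' selector is borrowed from the jsdom cells; r.html.render() would additionally execute JavaScript via pyppeteer but needs extra setup in Colab):

from requests_html import HTMLSession

session = HTMLSession()
r = session.get('https://bniajfi.org')        # plain HTTP fetch, no JS execution
print(r.html.find('#container', first=True))  # query the parsed HTML with a CSS selector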
%%writefile bniatest.js
'use strict';
// Load our tools
const HtmlTableToJson = require('html-table-to-json');
const csv = require('csv-parser');
const fs = require('fs');
const puppeteer = require('puppeteer');
const createCsvWriter = require('csv-writer').createObjectCsvWriter;

// The original cell used `csvWriter` and `header` without defining them; both
// must be configured for the table being scraped.
const csvWriter = createCsvWriter({ path: 'data.csv', header: [] }); // fill in {id, title} columns
const header = ''; // thead HTML to inject into the scraped tables

(async () => {
  // Set up the browser
  const browser = await puppeteer.launch({
    executablePath: "/usr/lib/chromium-browser/chromium-browser",
    args: ['--no-sandbox', '--window-size=1200,1000'],
    headless: true, // Colab has no display, so the browser cannot be shown
    ignoreHTTPSErrors: true,
  });
  const page = await browser.newPage();

  async function getTopWallets(page, num) {
    // Visit our page and configure the page view.
    await page.goto(`https://bniajfi.org`);
    await page.setViewport({ width: 800, height: 70000 });
    await page.waitForTimeout(500);

    // page.evaluate runs in the browser context, so `header` is passed in explicitly.
    const tblz = await page.evaluate((header) => {
      function getTblData(tblid) {
        if (tblid == '#tblOne') {
          // Swap the first table's thead for our own header markup.
          Array.prototype.forEach.call(document.querySelectorAll(tblid + " > thead"), function (node) {
            const el = document.createElement("thead");
            el.innerHTML = header;
            node.parentNode.insertBefore(el, node.nextSibling);
            node.parentNode.removeChild(node);
          });
        }
        // Strip <span> and <small> decorations so only the raw cell values remain.
        Array.prototype.forEach.call(document.querySelectorAll(tblid + " > tbody > tr > td > span"), function (node) {
          node.parentNode.removeChild(node);
        });
        Array.prototype.forEach.call(document.querySelectorAll(tblid + " > tbody > tr > td > small"), function (node) {
          node.parentNode.removeChild(node);
        });
        Array.prototype.forEach.call(document.querySelectorAll(tblid + " > tbody > tr > td:nth-child(3)"), function (node) {
          node.innerHTML = node.innerHTML; // no-op kept from the original cell
        });

        let output = Array.from(document.querySelectorAll(tblid), element => element.outerHTML);
        const position = 121;
        if (tblid == '#tblOne2') {
          // Splice the header markup into the second table's HTML at a fixed offset.
          output = [[output[0].slice(0, position), header, output[0].slice(position)].join('')];
        }
        return output;
      }
      return [...getTblData('#tblOne'), ...getTblData('#tblOne2')];
    }, header);

    // Convert both tables' HTML into arrays of JSON records.
    return [...HtmlTableToJson.parse(tblz[0]).results, ...HtmlTableToJson.parse(tblz[1]).results].flat();
  }

  let data = [];
  for (let i = 1; i < 2; i++) {
    data = [...data, ...await getTopWallets(page, i == 1 ? '' : ('-' + i))];
  }

  csvWriter.writeRecords(data).then(() => console.log('The CSV file was written'));
  await browser.close();
})();
!node bniatest.js
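Once bniatest.js has run, the scraped records can be inspected from Python. A minimal check, assuming csvWriter in bniatest.js was configured to write data.csv (the column names depend on that configuration):

import pandas as pd

df = pd.read_csv('data.csv')  # will raise if the scrape wrote nothing useful
print(df.shape)
df.head()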
Selenium
!sudo apt install unzip
!wget https://chromedriver.storage.googleapis.com/2.37/chromedriver_linux64.zip
#!unzip chromedriver_linux64.zip -d /usr/bin/
!pip install selenium
!pip install openpyxl
!apt-get update
!apt-get install -y unzip xvfb libxi6 libgconf-2-4
!apt-get install default-jdk
! apt install chromium-chromedriver
from google.colab import drive
import os, subprocess
import sys
import selenium
import bs4
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
import openpyxl
import time, re, csv, urllib.parse
import pandas as pd
options = Options()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_argument('--disable-gpu')

pd.set_option('display.max_columns', 20)
pd.set_option('display.max_colwidth', 200)

mobile_emulation = { "deviceName": "iPhone X" }
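The mobile_emulation dictionary above is defined but never attached to a driver. A sketch of how it could be wired up through ChromeOptions (mobile_options and mobile_driver are hypothetical names, not part of the original flow):

mobile_options = Options()  # hypothetical, kept separate from `options` above
mobile_options.add_argument('--headless')
mobile_options.add_argument('--no-sandbox')
mobile_options.add_experimental_option("mobileEmulation", mobile_emulation)  # emulate an iPhone X viewport/UA
# mobile_driver = webdriver.Chrome('chromedriver', chrome_options=mobile_options)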
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')
driver = webdriver.Chrome('chromedriver', chrome_options=options)
driver.get('https://bniajfi.org')
time.sleep(.5)
driver.save_screenshot('screenie.png')

with open("./page_source.html", "w") as f:
    f.write(driver.page_source)
driver.find_elements_by_css_selector('.container')
driver.find_element_by_tag_name('html').get_attribute('innerHTML')
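BeautifulSoup is imported above but never used. A minimal sketch of parsing the page source with it instead of querying the live driver (the 'container' id is the same one targeted elsewhere in the notebook):

soup = BeautifulSoup(driver.page_source, 'html.parser')
container = soup.find(id='container')
print(container.get_text(strip=True) if container else 'no #container element found')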
time.sleep(.5)

# The XPaths below are placeholders; point them at the real login elements.
login = driver.find_element_by_xpath('//*[@id="container"]/div/div/')
login.click()

# username
username = driver.find_element_by_xpath('//*[@id="container"]/div/div/')
username.click()
username.send_keys('user')

# password
password = driver.find_element_by_xpath('//*[@id="container"]/div/div/')
password.click()
password.send_keys('pass')

# sign in
signin = driver.find_element_by_xpath('//*[@id="container"]/div/div/')
signin.click()
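WebDriverWait and expected_conditions are imported above but never used; an explicit wait is usually more reliable than the time.sleep calls. A minimal sketch, using the 'container' id as a stand-in for the real login element:

element = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.ID, 'container'))
)  # blocks for up to 10 s until the element appears in the DOM
print(element.tag_name)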
import re
import os
import time
import random
import requests
import numpy as np
import pandas as pd
from os import system
from math import floor
from copy import deepcopy
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
import matplotlib.pyplot as plt
# import scraping modules
from selenium import webdriver
import pandas as pd

# open browser
driver = webdriver.Chrome()

# scrape vital signs
driver.get("https://medicalprogress.dev/patient_file2/vit_signs.html")
html = driver.page_source
driver.close()

data = pd.read_html(html)
data = data[0]
data = pd.DataFrame(data)
print(data)
# strip any trailing letters/notes that follow the numeric values in the second column
data[data.columns[1]] = data[data.columns[1]].str.replace(r'[a-zA-Z].*', '', regex=True)
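As a follow-up (not in the original notebook), the cleaned column can be converted to numbers so it can be plotted or aggregated; errors='coerce' turns any leftover non-numeric values into NaN:

data[data.columns[1]] = pd.to_numeric(data[data.columns[1]], errors='coerce')
print(data.dtypes)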