Home Software 01 Github 02 Colabs 03 Shell Basics 04 Apis 05 Webscraping 06 Nbdev 07 Javascript Packag... 08 Cloud Functions 09 Browser Extension Css Css Animations Javascript Llm Paradigms Protocols Websites

Don't Look! I'm changing!

URL Copied

BinderBinderBinderOpen Source Love svg3

NPM LicenseActiveGitHub last commit

GitHub starsGitHub watchersGitHub forksGitHub followers

TweetTwitter Follow

âš ī¸ For best experience open in colab!.âš ī¸

Selenium vs Puppeteer vs jsdom vs casper.ipynb

Casper is missing from this comparison. The others are very easy to use.

JSdom

!npm install jsdom
%%writefile testfile.js
 // testfile.js — load a live page with JSDOM and print the serialized HTML
 // that follows the inline 'document.getElementById("container")' script
 // (a crude way to isolate the part of the page after that marker).
 const jsdom = require("jsdom");
 const { JSDOM } = jsdom;
 JSDOM.fromURL('https://bniajfi.org', {resources: "usable"}).then(dom => {
   // BUG FIX: the original assigned to implicit globals; use const.
   const serialized = dom.serialize();
   // [1] keeps the portion AFTER the marker, as in the original.
   const txt = serialized.split('document.getElementById("container")')[1];
   console.log( txt );
 }).catch(err => {
   // BUG FIX: the original promise chain had no rejection handler.
   console.error(err);
 });
%%writefile bnia_trends.js
 const jsdom = require("jsdom");
 const { JSDOM } = jsdom;

 // BUG FIX: the original referenced an undefined `links` array (ReferenceError)
 // and never invoked the async IIFE (it ended with `} )`, missing the trailing
 // `()`), so the script did nothing. Seed links with the URL it hard-coded.
 const links = ['https://bniajfi.org'];

 (async () => {
   // Fetch every link in parallel and wait for all of them to finish,
   // instead of leaving floating promises inside .map().
   await Promise.all(links.map(async (link) => {
     console.log(link);
     const dom = await JSDOM.fromURL(link, {resources: "usable"});
     // [0] keeps the portion BEFORE the container marker, as in the original.
     const txt = dom.serialize().split('document.getElementById("container")')[0];
     console.log( txt );
   }));
 })().catch(err => console.error(err));

Puppeteer

Headless Recorder Extension

Now let's write the JS script and save it into a bniatest.js file

# Node CSV helpers + html-table-to-json for the Puppeteer scrape below.
# !npm install papaparse
 ! npm i -s csv-parser
!npm i -s csv-writer
! npm i async-csv
# Start the scraper output file empty.
!echo "" >data.csv
! npm install html-table-to-json
# NOTE(review): requests_html is a Python scraper — installed here but not
# used in any visible cell; confirm it is still needed.
! pip install requests_html
%%writefile bniatest.js
 'use strict';
 
 // Load our tool
 // Scrapes two HTML tables (#tblOne, #tblOne2) from bniajfi.org with a real
 // (Chromium) browser, converts them to JSON, and writes them to CSV.
 const HtmlTableToJson = require('html-table-to-json');
 const csv = require('csv-parser');
 const fs = require('fs');
 const puppeteer = require('puppeteer');
 
 (async () => {
   // Setup the browser
   var browser = await puppeteer.launch({
     executablePath:"/usr/lib/chromium-browser/chromium-browser", 
     args:['--no-sandbox', `--window-size=1200,1000`],
     // NOTE(review): headless:false requires a display; in Colab this
     // presumably needs headless:true (or xvfb) — confirm.
     headless: false, // The browser is visible
     ignoreHTTPSErrors: true,
   });
 
   const page = await browser.newPage()
   // Visits the page and returns both tables' rows as a flat JSON array.
   // NOTE(review): the `num` parameter is never used inside this function.
   async function getTopWallets(page, num){
     // Visit our page // Configure the page view.
     await page.goto(`https://bniajfi.org`)
     // Very tall viewport so lazily-rendered rows are all in the DOM.
     await page.setViewport({ width: 800, height: 70000 })
     await page.waitForTimeout(500);
     // Runs in the BROWSER context: cleans the tables in-place, then
     // returns their outerHTML for parsing back in Node.
     var tblz = await page.evaluate( () => {
         function getTblData(tblid){
           if(tblid=='#tblOne'){
             // Replace the original <thead> with a custom header row.
             // NOTE(review): `header` is never defined anywhere in this
             // script — this branch will throw a ReferenceError if a
             // matching <thead> exists. Define `header` before use.
             Array.prototype.forEach.call( document.querySelectorAll(tblid+ " > thead"), function( node ) {
                 var el = document.createElement("thead");  el.innerHTML = header
                 node.parentNode.insertBefore(el, node.nextSibling); node.parentNode.removeChild( node );
                 // NOTE(review): returning from a forEach callback is a
                 // no-op — this value is discarded.
                 return Array.from( document.querySelectorAll(tblid), element => element.outerHTML )
             });
           }
           // Strip decorative <span>/<small> elements from body cells.
           Array.prototype.forEach.call( document.querySelectorAll(tblid +" > tbody > tr > td > span"), function( node ) { node.parentNode.removeChild( node ); });
           Array.prototype.forEach.call( document.querySelectorAll(tblid +" > tbody > tr > td > small"), function( node ) { node.parentNode.removeChild( node ); });
           // NOTE(review): assigning innerHTML to itself is a no-op.
           Array.prototype.forEach.call( document.querySelectorAll(tblid +" > tbody > tr > td:nth-child(3)"), function( node ) { node.innerHTML = node.innerHTML; });
           var output =  Array.from( document.querySelectorAll(tblid), element => element.outerHTML )
           // Splice the custom header string into #tblOne2's HTML at a
           // hard-coded character offset (121) — brittle; see `header` note.
           var position =  121
           if(tblid=='#tblOne2'){ output = [ [output[0].slice(0, position), header, output[0].slice(position)].join('') ] }
           return output
         }
         return [...getTblData('#tblOne'), ...getTblData('#tblOne2') ]
     } )
     // Parse both tables' HTML into row objects and flatten into one list.
     return [...HtmlTableToJson.parse( tblz[0] ).results, ...HtmlTableToJson.parse( tblz[1] ).results ].flat()
   }
   var data = []
   // Loop runs exactly once (i = 1); the page suffix argument is unused.
   for (var i = 1; i < 2; i++) {
     data = [...data, ...await getTopWallets(page, i==1?'':('-'+i) ) ]
   } 
 
   // NOTE(review): `csvWriter` is never created (csv-writer is installed but
   // never required/configured) — this line will throw a ReferenceError.
   csvWriter.writeRecords(data).then(()=> console.log('The CSV file was written'));
 
   await browser.close()
 })()
# Run the Puppeteer scraper written above.
!node bniatest.js

Selenium

!sudo apt install unzip
 # NOTE(review): chromedriver 2.37 is from 2018 and must match the installed
 # Chromium major version; the unzip below is commented out, so the
 # chromium-chromedriver apt package at the end is what actually gets used.
 !wget https://chromedriver.storage.googleapis.com/2.37/chromedriver_linux64.zip
 #!unzip chromedriver_linux64.zip -d /usr/bin/
 !pip install selenium
 !pip install openpyxl
 !apt-get update
 # Virtual-display / X dependencies some headless Chrome setups need.
 !apt-get install -y unzip xvfb libxi6 libgconf-2-4
 !apt-get install default-jdk 
! apt install chromium-chromedriver
from google.colab import drive
import os, subprocess
 import sys
 import selenium
 import bs4
 from selenium import webdriver
 from selenium.webdriver.chrome.options import Options
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.common.by import By
 from selenium.common.exceptions import TimeoutException
 from bs4 import BeautifulSoup
 import openpyxl
 import time, re, csv, urllib.parse
 import pandas as pd
options = Options()
 # Standard flags for running Chrome headless inside a container/Colab VM.
 options.add_argument('--headless')
 options.add_argument('--no-sandbox')
 options.add_argument('--disable-dev-shm-usage')
 options.add_argument('--disable-gpu')
 
 # Widen pandas display so scraped tables print without truncation.
 pd.set_option('display.max_columns', 20)
 pd.set_option('display.max_colwidth', 200)
 # NOTE(review): mobile_emulation is defined but never applied to the driver
 # (would need options.add_experimental_option("mobileEmulation", ...)) —
 # confirm whether mobile emulation is actually intended.
 mobile_emulation = { "deviceName": "iPhone X" }
# Make the Colab-bundled chromedriver discoverable on the path.
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')
# BUG FIX: `chrome_options=` and the positional executable-path argument were
# deprecated and removed in Selenium 4 — pass `options=` instead.
driver = webdriver.Chrome(options=options)
driver.get('https://bniajfi.org')
 # Brief pause to let the page finish rendering before capturing it.
 time.sleep(.5)
 # Save a screenshot of the rendered page.
 driver.save_screenshot('screenie.png')
 
 # Dump the rendered HTML to disk for offline inspection.
 with open("./page_source.html", "w") as f:
     f.write(driver.page_source)
# BUG FIX: the find_element(s)_by_* helpers were removed in Selenium 4.3 —
# use the By locator API (By is imported above).
driver.find_elements(By.CSS_SELECTOR, '.container')
# Grab the full rendered markup of the page.
driver.find_element(By.TAG_NAME, 'html').get_attribute('innerHTML')
time.sleep(.5)
 login = driver.find_element_by_xpath('//*[@id="container"]/div/div/')
 login.click()
 
 #username
 username = driver.find_element_by_xpath('//*[@id="container"]/div/div/')
 username.click()
 username.send_keys('user')
 
 #password
 password = driver.find_element_by_xpath('//*[@id="container"]/div/div/')
 password.click()
 password.send_keys('pass')
 
 #sign in 
 signin = driver.find_element_by_xpath('//*[@id="container"]/div/div/')
 signin.click()
import re
 import os
 import time
 import random
 import requests
 import numpy as np
 import pandas as pd
 from os import system
 from math import floor
 from copy import deepcopy
 from bs4 import BeautifulSoup
 from selenium import webdriver
 from selenium.webdriver.common.by import By
 from selenium.webdriver.common.keys import Keys
 from selenium.common.exceptions import TimeoutException
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
 
 import time
 from selenium.webdriver.chrome.options import Options
 
 import matplotlib.pyplot as plt
# import scraping modules
 from selenium import webdriver
 import pandas as pd
 
 # open browser
 driver = webdriver.Chrome()
 
 # scrape vital signs
 driver.get("https://medicalprogress.dev/patient_file2/vit_signs.html")
 html = driver.page_source
 driver.close()
 data = pd.read_html(html)
 data = data[0]
 data = pd.DataFrame(data)
 print(data)
# replace all comments behind numbers with nothing
 data[data.columns[1]] = data[data.columns[1]].str.replace(r'[a-zA-Z].*', '', regex=True)