Importing Necessary Libraries¶
In [1]:
import requests
from bs4 import BeautifulSoup as bs
Loading our First Page¶
In [5]:
# Load the webpage content
r = requests.get('https://keithgalli.github.io/web-scraping/example.html')
# Convert to a beautiful soup object
soup = bs(r.content)
# Print out our html
print(soup.prettify())
<html>
<head>
<title>
HTML Example
</title>
</head>
<body>
<div align="middle">
<h1>
HTML Webpage
</h1>
<p>
Link to more interesting example:
<a href="https://keithgalli.github.io/web-scraping/webpage.html">
keithgalli.github.io/web-scraping/webpage.html
</a>
</p>
</div>
<h2>
A Header
</h2>
<p>
<i>
Some italicized text
</i>
</p>
<h2>
Another header
</h2>
<p id="paragraph-id">
<b>
Some bold text
</b>
</p>
</body>
</html>
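Note: if no parser is passed to BeautifulSoup, it picks the best one installed and may print a warning. A minimal sketch passing the built-in parser explicitly (the parsed result is the same as above):

soup = bs(r.content, 'html.parser')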
Start using BeautifulSoup to Scrape¶
In [8]:
first_header = soup.find('h2')
first_header
Out[8]:
<h2>A Header</h2>
In [9]:
headers = soup.find_all('h2')
headers
Out[9]:
[<h2>A Header</h2>, <h2>Another header</h2>]
In [10]:
# Pass in a list of elements to look for
first_header = soup.find(["h1", "h2"])
first_header
Out[10]:
<h1>HTML Webpage</h1>
In [11]:
first_header = soup.find(["h2", "h1"])
first_header
Out[11]:
<h1>HTML Webpage</h1>
In [12]:
headers = soup.find_all(["h1", "h2"])
headers
Out[12]:
[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]
In [16]:
# You can pass in attributes to the find/find_all function
paragraph = soup.find_all('p', attrs={'id': 'paragraph-id'})
paragraph
Out[16]:
[<p id="paragraph-id"><b>Some bold text</b></p>]
In [19]:
# You can nest find/find_all calls
body = soup.find('body')
div = body.find('div')
header = div.find('h1')
header
Out[19]:
<h1>HTML Webpage</h1>
In [21]:
# We can search specific strings in our find/find_all calls
import re
para = soup.find_all('p', string=re.compile('Some'))
para
Out[21]:
[<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]
In [22]:
head = soup.find_all('h2', string=re.compile('(H|h)eader'))
head
Out[22]:
[<h2>A Header</h2>, <h2>Another header</h2>]
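The same match can be written with a case-insensitive flag instead of the (H|h) group; just a sketch of the equivalent call:

head = soup.find_all('h2', string=re.compile('header', re.IGNORECASE))
head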
Select (CSS Selector)¶
In [24]:
content = soup.select('div p')
content
Out[24]:
[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>]
In [25]:
pg = soup.select('h2 ~ p')
pg
Out[25]:
[<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]
In [28]:
bold = soup.select('p#paragraph-id b')
bold
Out[28]:
[<b>Some bold text</b>]
In [40]:
paras = soup.select('body > p')
print(paras)
[<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]
In [41]:
for para in paras:
    print(para.select("i"))
[<i>Some italicized text</i>]
[]
In [35]:
# Grab by element with specific property
soup.select("[align=middle]")
Out[35]:
[<div align="middle"> <h1>HTML Webpage</h1> <p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p> </div>]
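Attribute selectors can also be combined with tag names and descendant selectors; a small sketch that grabs the link inside the centered div:

soup.select('div[align=middle] a')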
Get different properties of the HTML¶
Getting Strings from HTML¶
In [45]:
# use .string
soup.find('h2').string
Out[45]:
'A Header'
In [49]:
# If multiple child elements use get_text
div = soup.find('div')
print(div.get_text())
HTML Webpage Link to more interesting example: keithgalli.github.io/web-scraping/webpage.html
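For contrast, a quick sketch of why .string is not enough here: on a tag with several children it returns None, while get_text flattens all of the nested text (the separator and strip arguments are optional tidying):

div = soup.find('div')
print(div.string)                               # None: the div has multiple children
print(div.get_text(separator=' ', strip=True))  # all nested text joined with spaces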
Getting Links from HTML¶
In [50]:
# Get a specific property from an element
link = soup.find('a')
link['href']
Out[50]:
'https://keithgalli.github.io/web-scraping/webpage.html'
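If an attribute might be missing, Tag.get avoids a KeyError; a minimal sketch (the 'target' attribute is just an example of one this link does not have):

link = soup.find('a')
print(link.get('href'))           # same value as link['href']
print(link.get('target', 'n/a'))  # returns the fallback when the attribute is absent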
Subsetting to get what you want from HTML¶
In [51]:
paragraphs = soup.select("p#paragraph-id")
paragraphs[0]['id']
Out[51]:
'paragraph-id'
Code Navigation¶
In [61]:
# Know the terms: Parent, Sibling, Child
soup.body.find("div").find_parents()
Out[61]:
[<body> <div align="middle"> <h1>HTML Webpage</h1> <p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p> </div> <h2>A Header</h2> <p><i>Some italicized text</i></p> <h2>Another header</h2> <p id="paragraph-id"><b>Some bold text</b></p> </body>, <html> <head> <title>HTML Example</title> </head> <body> <div align="middle"> <h1>HTML Webpage</h1> <p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p> </div> <h2>A Header</h2> <p><i>Some italicized text</i></p> <h2>Another header</h2> <p id="paragraph-id"><b>Some bold text</b></p> </body> </html>, <html> <head> <title>HTML Example</title> </head> <body> <div align="middle"> <h1>HTML Webpage</h1> <p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p> </div> <h2>A Header</h2> <p><i>Some italicized text</i></p> <h2>Another header</h2> <p id="paragraph-id"><b>Some bold text</b></p> </body> </html>]
In [62]:
soup.body.find("div").find_parent()
Out[62]:
<body> <div align="middle"> <h1>HTML Webpage</h1> <p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p> </div> <h2>A Header</h2> <p><i>Some italicized text</i></p> <h2>Another header</h2> <p id="paragraph-id"><b>Some bold text</b></p> </body>
In [63]:
soup.body.find("div").find_previous_siblings()
Out[63]:
[]
In [64]:
soup.body.find("div").find_previous_sibling()
In [59]:
soup.body.find("div").find_next_siblings()
Out[59]:
[<h2>A Header</h2>, <p><i>Some italicized text</i></p>, <h2>Another header</h2>, <p id="paragraph-id"><b>Some bold text</b></p>]
In [60]:
soup.body.find("div").find_next_sibling()
Out[60]:
<h2>A Header</h2>
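The same relationships are also exposed as properties; a short sketch (plain .next_sibling can return whitespace text nodes, which is why find_next_sibling is often more convenient):

div = soup.body.find('div')
print(div.parent.name)               # 'body'
print(div.find_next_sibling().name)  # 'h2'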
Exercises¶
Loading the webpage¶
In [2]:
# Load the webpage content
r = requests.get("https://keithgalli.github.io/web-scraping/webpage.html")
# Convert to a beautiful soup object
wp = bs(r.content)
# Print out our html
print(wp.prettify())
<html>
<head>
<title>
Keith Galli's Page
</title>
<style>
table {
border-collapse: collapse;
}
th {
padding:5px;
}
td {
border: 1px solid #ddd;
padding: 5px;
}
tr:nth-child(even) {
background-color: #f2f2f2;
}
th {
padding-top: 12px;
padding-bottom: 12px;
text-align: left;
background-color: #add8e6;
color: black;
}
.block {
width: 100px;
/*float: left;*/
display: inline-block;
zoom: 1;
}
.column {
float: left;
height: 200px;
/*width: 33.33%;*/
padding: 5px;
}
.row::after {
content: "";
clear: both;
display: table;
}
</style>
</head>
<body>
<h1>
Welcome to my page!
</h1>
<img src="./images/selfie1.jpg" width="300px"/>
<h2>
About me
</h2>
<p>
Hi, my name is Keith and I am a YouTuber who focuses on content related to programming, data science, and machine learning!
</p>
<p>
Here is a link to my channel:
<a href="https://www.youtube.com/kgmit">
youtube.com/kgmit
</a>
</p>
<p>
I grew up in the great state of New Hampshire here in the USA. From an early age I always loved math. Around my senior year of high school, my brother first introduced me to programming. I found it a creative way to apply the same type of logical thinking skills that I enjoyed with math. This influenced me to study computer science in college and ultimately create a YouTube channel to share some things that I have learned along the way.
</p>
<h3>
Hobbies
</h3>
<p>
Believe it or not, I don't code 24/7. I love doing all sorts of active things. I like to play ice hockey & table tennis as well as run, hike, skateboard, and snowboard. In addition to sports, I am a board game enthusiast. The two that I've been playing the most recently are
<i>
Settlers of Catan
</i>
and
<i>
Othello
</i>
.
</p>
<h3>
Fun Facts
</h3>
<ul class="fun-facts">
<li>
Owned my dream car in high school
<a href="#footer">
<sup>
1
</sup>
</a>
</li>
<li>
Middle name is Ronald
</li>
<li>
Never had been on a plane until college
</li>
<li>
Dunkin Donuts coffee is better than Starbucks
</li>
<li>
A favorite book series of mine is
<i>
Ender's Game
</i>
</li>
<li>
Current video game of choice is
<i>
Rocket League
</i>
</li>
<li>
The band that I've seen the most times live is the
<i>
Zac Brown Band
</i>
</li>
</ul>
<h2>
Social Media
</h2>
I encourage you to check out my content on all social media platforms
<br/>
<ul class="socials">
<li class="social instagram">
<b>
Instagram:
</b>
<a href="https://www.instagram.com/keithgalli/">
https://www.instagram.com/keithgalli/
</a>
</li>
<li class="social twitter">
<b>
Twitter:
</b>
<a href="https://twitter.com/keithgalli">
https://twitter.com/keithgalli
</a>
</li>
<li class="social linkedin">
<b>
LinkedIn:
</b>
<a href="https://www.linkedin.com/in/keithgalli/">
https://www.linkedin.com/in/keithgalli/
</a>
</li>
<li class="social tiktok">
<b>
TikTok:
</b>
<a href="https://www.tiktok.com/@keithgalli">
https://www.tiktok.com/@keithgalli
</a>
</li>
</ul>
<h2>
Photos
</h2>
Here are a few photos from a trip to italy I took last year
<div class="row">
<div class="column">
<img alt="Lake Como" src="images/italy/lake_como.jpg" style="height:100%"/>
</div>
<div class="column">
<img alt="Pontevecchio, Florence" src="images/italy/pontevecchio.jpg" style="height:100%"/>
</div>
<div class="column">
<img alt="Riomaggiore, Cinque de Terre" src="images/italy/riomaggiore.jpg" style="height:100%"/>
</div>
</div>
<div>
</div>
<h2>
Table
</h2>
My MIT hockey stats :)
<br/>
<table class="hockey-stats">
<thead>
<tr>
<th class="season" data-sort="">
S
</th>
<th class="team" data-sort="team">
Team
</th>
<th class="league" data-sort="league">
League
</th>
<th class="regular gp" data-sort="gp">
GP
</th>
<th class="regular g" data-sort="g">
G
</th>
<th class="regular a" data-sort="a">
A
</th>
<th class="regular tp" data-sort="tp">
TP
</th>
<th class="regular pim" data-sort="pim">
PIM
</th>
<th class="regular pm" data-sort="pm">
+/-
</th>
<th class="separator">
</th>
<th class="postseason">
POST
</th>
<th class="postseason gp" data-sort="playoffs-gp">
GP
</th>
<th class="postseason g" data-sort="playoffs-g">
G
</th>
<th class="postseason a" data-sort="playoffs-a">
A
</th>
<th class="postseason tp" data-sort="playoffs-tp">
TP
</th>
<th class="postseason pim" data-sort="playoffs-pim">
PIM
</th>
<th class="postseason pm" data-sort="playoffs-pm">
+/-
</th>
</tr>
</thead>
<tbody>
<tr class="team-continent-NA">
<td class="season sorted">
2014-15
</td>
<td class="team">
<i>
<img src="images/flag.png"/>
</i>
<span class="txt-blue">
<a href="https://www.eliteprospects.com/team/10263/mit-mass.-inst.-of-tech./2014-2015?tab=stats">
MIT (Mass. Inst. of Tech.)
</a>
</span>
</td>
<td class="league">
<a href="https://www.eliteprospects.com/league/acha-ii/stats/2014-2015">
ACHA II
</a>
</td>
<td class="regular gp">
17
</td>
<td class="regular g">
3
</td>
<td class="regular a">
9
</td>
<td class="regular tp">
12
</td>
<td class="regular pim">
20
</td>
<td class="regular pm">
</td>
<td class="separator">
|
</td>
<td class="postseason">
<a href="https://www.eliteprospects.com/league/acha-ii/stats/2014-2015">
</a>
</td>
<td class="postseason gp">
</td>
<td class="postseason g">
</td>
<td class="postseason a">
</td>
<td class="postseason tp">
</td>
<td class="postseason pim">
</td>
<td class="postseason pm">
</td>
</tr>
<tr class="team-continent-NA">
<td class="season sorted">
2015-16
</td>
<td class="team">
<i>
<img src="images/flag.png"/>
</i>
<span class="txt-blue">
<a href="https://www.eliteprospects.com/team/10263/mit-mass.-inst.-of-tech./2015-2016?tab=stats">
MIT (Mass. Inst. of Tech.)
</a>
</span>
</td>
<td class="league">
<a href="https://www.eliteprospects.com/league/acha-ii/stats/2015-2016">
ACHA II
</a>
</td>
<td class="regular gp">
9
</td>
<td class="regular g">
1
</td>
<td class="regular a">
1
</td>
<td class="regular tp">
2
</td>
<td class="regular pim">
2
</td>
<td class="regular pm">
</td>
<td class="separator">
|
</td>
<td class="postseason">
<a href="https://www.eliteprospects.com/league/acha-ii/stats/2015-2016">
</a>
</td>
<td class="postseason gp">
</td>
<td class="postseason g">
</td>
<td class="postseason a">
</td>
<td class="postseason tp">
</td>
<td class="postseason pim">
</td>
<td class="postseason pm">
</td>
</tr>
<tr class="team-continent-NA">
<td class="season sorted">
2016-17
</td>
<td class="team">
<i>
<img src="images/flag.png"/>
</i>
<span class="txt-blue">
<a href="https://www.eliteprospects.com/team/10263/mit-mass.-inst.-of-tech./2016-2017?tab=stats">
MIT (Mass. Inst. of Tech.)
</a>
</span>
</td>
<td class="league">
<a href="https://www.eliteprospects.com/league/acha-ii/stats/2016-2017">
ACHA II
</a>
</td>
<td class="regular gp">
12
</td>
<td class="regular g">
5
</td>
<td class="regular a">
5
</td>
<td class="regular tp">
10
</td>
<td class="regular pim">
8
</td>
<td class="regular pm">
0
</td>
<td class="separator">
|
</td>
<td class="postseason">
</td>
<td class="postseason gp">
</td>
<td class="postseason g">
</td>
<td class="postseason a">
</td>
<td class="postseason tp">
</td>
<td class="postseason pim">
</td>
<td class="postseason pm">
</td>
</tr>
<tr class="team-continent-EU">
<td class="season sorted">
2017-18
</td>
<td class="team">
Did not play
</td>
<td class="league">
<a href="https://www.eliteprospects.com/stats">
</a>
</td>
<td class="regular gp">
</td>
<td class="regular g">
</td>
<td class="regular a">
</td>
<td class="regular tp">
</td>
<td class="regular pim">
</td>
<td class="regular pm">
</td>
<td class="separator">
|
</td>
<td class="postseason">
<a href="https://www.eliteprospects.com/stats">
</a>
</td>
<td class="postseason gp">
</td>
<td class="postseason g">
</td>
<td class="postseason a">
</td>
<td class="postseason tp">
</td>
<td class="postseason pim">
</td>
<td class="postseason pm">
</td>
</tr>
<tr class="team-continent-NA">
<td class="season sorted">
2018-19
</td>
<td class="team">
<i>
<img src="images/flag.png"/>
</i>
<span class="txt-blue">
<a href="https://www.eliteprospects.com/team/10263/mit-mass.-inst.-of-tech./2018-2019?tab=stats">
MIT (Mass. Inst. of Tech.)
</a>
</span>
</td>
<td class="league">
<a href="https://www.eliteprospects.com/league/acha-iii/stats/2018-2019">
ACHA III
</a>
</td>
<td class="regular gp">
8
</td>
<td class="regular g">
5
</td>
<td class="regular a">
10
</td>
<td class="regular tp">
15
</td>
<td class="regular pim">
8
</td>
<td class="regular pm">
</td>
<td class="separator">
|
</td>
<td class="postseason">
<a href="https://www.eliteprospects.com/league/acha-iii/stats/2018-2019">
</a>
</td>
<td class="postseason gp">
</td>
<td class="postseason g">
</td>
<td class="postseason a">
</td>
<td class="postseason tp">
</td>
<td class="postseason pim">
</td>
<td class="postseason pm">
</td>
</tr>
</tbody>
</table>
<h2>
Mystery Message Challenge!
</h2>
<p>
If you scrape the links below grabbing the <p> tag with id="secret-word", you'll discover a secret message :)
</p>
<div width="50%">
<div align="left" class="block">
<ul>
<li>
<a href="challenge/file_1.html">
File 1
</a>
</li>
<li>
<a href="challenge/file_2.html">
File 2
</a>
</li>
<li>
<a href="challenge/file_3.html">
File 3
</a>
</li>
<li>
<a href="challenge/file_4.html">
File 4
</a>
</li>
<li>
<a href="challenge/file_5.html">
File 5
</a>
</li>
</ul>
</div>
<div align="center" class="block">
<ul>
<li>
<a href="challenge/file_6.html">
File 6
</a>
</li>
<li>
<a href="challenge/file_7.html">
File 7
</a>
</li>
<li>
<a href="challenge/file_8.html">
File 8
</a>
</li>
<li>
<a href="challenge/file_9.html">
File 9
</a>
</li>
<li>
<a href="challenge/file_10.html">
File 10
</a>
</li>
</ul>
</div>
</div>
<h2>
Footnotes
</h2>
<p id="footer">
1. This was actually a minivan that I named Debora. Maybe not my dream car, but I loved her nonetheless.
</p>
</body>
</html>
Question 1: Grab all of the social links from the web page in 4 ways¶
Link to the web page: https://keithgalli.github.io/web-scraping/webpage.html
Method 1¶
In [5]:
links = wp.select('ul.socials a')
actual_links = [link['href'] for link in links]
actual_links
Out[5]:
['https://www.instagram.com/keithgalli/', 'https://twitter.com/keithgalli', 'https://www.linkedin.com/in/keithgalli/', 'https://www.tiktok.com/@keithgalli']
Method 2¶
In [19]:
ulist = wp.find('ul', attrs={'class': 'socials'})
# find_all returns a list of matching tags (find would only return the first one)
links = ulist.find_all("a")
actual_links = [link['href'] for link in links]
actual_links
Out[19]:
['https://www.instagram.com/keithgalli/', 'https://twitter.com/keithgalli', 'https://www.linkedin.com/in/keithgalli/', 'https://www.tiktok.com/@keithgalli']
Method 3¶
In [22]:
links = wp.select("li.social a")
actual_links = [link['href'] for link in links]
actual_links
Out[22]:
['https://www.instagram.com/keithgalli/', 'https://twitter.com/keithgalli', 'https://www.linkedin.com/in/keithgalli/', 'https://www.tiktok.com/@keithgalli']
Method 4¶
In [31]:
links = wp.select("body ul li.social a")
actual_links = [link['href'] for link in links]
actual_links
Out[31]:
['https://www.instagram.com/keithgalli/', 'https://twitter.com/keithgalli', 'https://www.linkedin.com/in/keithgalli/', 'https://www.tiktok.com/@keithgalli']
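For completeness, the same links can also be pulled with find_all and the class_ keyword instead of a CSS selector (just another sketch, not one of the four methods above):

items = wp.find_all('li', class_='social')
actual_links = [item.find('a')['href'] for item in items]
actual_links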
Scraping the MIT Hockey Stats table¶
In [28]:
import pandas as pd
In [48]:
table = wp.select('table.hockey-stats')[0]
columns = table.find_all('th')
column_names = [c.string for c in columns]
table_rows = table.find('tbody').find_all('tr')
rows = []  # one list of cell strings per table row
for tr in table_rows:
    cells = tr.find_all('td')
    row = [cell.get_text().strip() for cell in cells]
    rows.append(row)
# print(rows[0])
df = pd.DataFrame(rows, columns=column_names)
df
Out[48]:
|   | S       | Team                       | League   | GP | G | A  | TP | PIM | +/- |    | POST | GP | G | A | TP | PIM | +/- |
|---|---------|----------------------------|----------|----|---|----|----|-----|-----|----|------|----|---|---|----|-----|-----|
| 0 | 2014-15 | MIT (Mass. Inst. of Tech.) | ACHA II  | 17 | 3 | 9  | 12 | 20  |     | \| |      |    |   |   |    |     |     |
| 1 | 2015-16 | MIT (Mass. Inst. of Tech.) | ACHA II  | 9  | 1 | 1  | 2  | 2   |     | \| |      |    |   |   |    |     |     |
| 2 | 2016-17 | MIT (Mass. Inst. of Tech.) | ACHA II  | 12 | 5 | 5  | 10 | 8   | 0   | \| |      |    |   |   |    |     |     |
| 3 | 2017-18 | Did not play               |          |    |   |    |    |     |     | \| |      |    |   |   |    |     |     |
| 4 | 2018-19 | MIT (Mass. Inst. of Tech.) | ACHA III | 8  | 5 | 10 | 15 | 8   |     | \| |      |    |   |   |    |     |     |
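As an aside, pandas can also parse the table directly from the HTML; a sketch assuming lxml (or html5lib) is installed, with StringIO used because newer pandas versions prefer a file-like object over a raw string:

from io import StringIO
dfs = pd.read_html(StringIO(str(table)))
dfs[0].head()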
Grab all fun facts that contain the word 'is'¶
In [61]:
import re
facts = wp.select('ul.fun-facts li')
facts_with_is = [fact.find(string=re.compile('is')) for fact in facts]
facts_with_is = [fact.find_parent().get_text() for fact in facts_with_is if fact]
facts_with_is
Out[61]:
['Middle name is Ronald', 'Dunkin Donuts coffee is better than Starbucks', "A favorite book series of mine is Ender's Game", 'Current video game of choice is Rocket League', "The band that I've seen the most times live is the Zac Brown Band"]
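A more compact variant of the same idea, checking the flattened text of each bullet instead of searching its child strings (note this is a plain substring test, so it would also match 'is' inside longer words):

[fact.get_text() for fact in wp.select('ul.fun-facts li') if 'is' in fact.get_text()]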
Download an Image from a web page¶
In [63]:
# Load the webpage content
url = "https://keithgalli.github.io/web-scraping/"
r = requests.get(url + "webpage.html")
# Convert to a beautiful soup object
webpage = bs(r.content)
images = webpage.select("div.row div.column img")
image_url = images[0]['src']
full_url = url + image_url
img_data = requests.get(full_url).content
with open('lake_como.jpg', 'wb') as handler:
    handler.write(img_data)
The image is saved to the working directory as lake_como.jpg.
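To save all three Italy photos instead of just the first, the same pattern can run in a loop; a sketch where the local file names are simply taken from the end of each src path:

import os
for img in images:
    image_url = img['src']
    file_name = os.path.basename(image_url)  # e.g. 'lake_como.jpg'
    img_data = requests.get(url + image_url).content
    with open(file_name, 'wb') as handler:
        handler.write(img_data)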
Solve the mystery challenge!¶
In [78]:
files = webpage.select("div.block a")
relative_files = [f['href'] for f in files]
url = "https://keithgalli.github.io/web-scraping/"
for f in relative_files:
    full_url = url + f
    page = requests.get(full_url)
    bs_page = bs(page.content)
    secret_word_element = bs_page.find("p", attrs={"id": "secret-word"})
    secret_word = secret_word_element.string
    print(secret_word)
Make
sure
to
smash
that
like
button
and
subscribe
!!!
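A small variation that collects the words into a single message rather than printing them one per line (just a sketch):

words = []
for f in relative_files:
    page_soup = bs(requests.get(url + f).content, 'html.parser')
    words.append(page_soup.find('p', attrs={'id': 'secret-word'}).get_text().strip())
print(' '.join(words))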