python - Gary Sieling

This example shows how to use the Beautiful Soup library to find all images referenced in a set of HTML files, then filter them by size (here, keeping only images larger than 150×150 pixels) – this works well for excluding header images, logos, tracking pixels, etc. It assumes you have mirrored a website’s directory structure locally with wget. Unlike most examples, this handles some real-world data – e.g. 404’ed images and different URL structures.

from bs4 import BeautifulSoup
import Image
import urllib2
import os, re
 
# Host name of the mirrored site; an image src that mentions it is treated as
# a path that already starts at the mirror's top-level directory.
domain = "www.example.com"
# Directory to scan for HTML files (set this to the local mirror).
indir = ""
# Matches the file names that should be parsed as HTML.
fre = re.compile(".*html$")

# URL schemes stripped from an image src before it is mapped to a local path.
URL_PREFIXES = ("http://", "https://")


def resolve_image_path(src, root):
    """Map the ``src`` of an <img> tag to a path in the local mirror.

    Args:
        src: The raw ``src`` attribute value (non-empty string).
        root: Directory of the HTML file that references the image.

    Returns:
        The path to try opening. A leading ``http://`` or ``https://`` is
        removed. If the result mentions ``domain`` it is returned unchanged
        (i.e. relative to the current working directory); otherwise it is
        joined onto ``root``.
    """
    # Only strip a scheme that actually starts the string; a blind slice
    # would mangle any src that merely contains "http://" further in.
    for prefix in URL_PREFIXES:
        if src.startswith(prefix):
            src = src[len(prefix):]
            break

    if domain not in src:
        return os.path.join(root, src)
    return src


def is_large_image(tag, root, min_width=150, min_height=150):
    """Return True if ``tag`` points at a local image above the size limits.

    Args:
        tag: An <img> element (anything offering a dict-style ``get``).
        root: Directory of the HTML file that contains ``tag``.
        min_width: Width in pixels the image must exceed.
        min_height: Height in pixels the image must exceed.

    Returns:
        False when the tag has no usable ``src`` or the file cannot be opened
        as an image (e.g. it was a 404 when mirrored); otherwise whether both
        dimensions exceed the given minimums.
    """
    # .get() instead of indexing: <img> tags without a src attribute would
    # otherwise raise KeyError and abort the whole scan.
    src = tag.get('src')
    if not src:
        return False

    try:
        img = Image.open(resolve_image_path(src, root))
    except IOError:
        return False

    width, height = img.size
    return width > min_width and height > min_height


for root, dirs, filenames in os.walk(indir):
    for filename in filenames:
        if not fre.match(filename):
            continue

        htmlfile = os.path.join(root, filename)
        print("Parsing " + htmlfile)

        # Read as bytes so the parser does its own encoding detection, and
        # close the handle as soon as the document has been parsed.
        with open(htmlfile, "rb") as handle:
            soup = BeautifulSoup(handle, "html5lib")

        # A real list (not a lazy filter object) so the print below shows
        # the matching tags on every Python version.
        art = [img for img in soup.find_all('img') if is_large_image(img, root)]
        print(art)

Tag: python

Finding all images in HTML files over a certain size with Python BeautifulSoup