This example shows how to use the Beautiful Soup library to find all images referenced in a set of HTML files, then filter them to a particular size range — this works well for filtering out header images, logos, tracking pixels, etc. It assumes a system where you mirrored a website's directory structure with wget. Unlike most examples, this handles some real-world data — e.g. 404'ed images and differing URL structures.
from bs4 import BeautifulSoup
import Image
import urllib2
import os, re
domain = "www.example.com"
indir = ""
fre = re.compile(".*html$")
for root, dirs, filenames in os.walk(indir):
for f in filenames:
if fre.match(f):
htmlfile = os.path.join(root, f)
print "Parsing " + htmlfile
soup = BeautifulSoup(open(htmlfile), "html5lib")
images = soup.find_all('img')
def f(x) :
if (x['src'] is None):
return False
imgSrc = x['src']
if ("http://" in imgSrc):
imgSrc = imgSrc[7:]
if domain not in imgSrc:
imgSrc = os.path.join(root, imgSrc)
imgFile = imgSrc
try:
img = Image.open(imgFile)
width, height = img.size
return width > 150 and height > 150
except IOError:
return False
art = filter(f, images)
print(art)