Scraping Tabular Websites into a CSV file using PhantomJS

While there are many tools for scraping website content, two of my current favorites are PhantomJS (Javascript) and BeautifulSoup (Python). Many small-scale problems are easily solved by downloading the files with wget and then post-processing them locally – this works well because the post-processing typically takes dozens of iterations to extract clean data, and you don’t want to hit the site on every attempt. If you’re wondering about the legitimacy of this, it’s not just for content thieves: it’s recommended by archive.org for downloading content from their site, and it’s the currently prescribed method for bulk access to Philadelphia and Pennsylvania Laws, which really should be more easily accessible to the public.

In this example, I have ~350 separate files, each containing a table. The Javascript code goes in a file called csv.js, which you can run like so:

phantomjs csv.js

The first step is to retrieve a single element from a single page – most likely an index page that lists the links you want to download.

To do this, use your favorite non-IE browser, right-click an element you are interested in, and select “Inspect Element”. In the Developer Tools window, select “Copy XPath”. If all goes well, the result will have array indexes in it, and you can remove one or more of them to return all rows/columns as needed:

var nodes = document.evaluate('//*[@id="hm-lower-background"]/div/a', document,
  null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null);
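
Before writing any script around it, you can sanity-check the expression in the same console by poking at the snapshot it returns:

nodes.snapshotLength                 // how many elements matched
nodes.snapshotItem(0).innerText      // text of the first match
nodes.snapshotItem(0).href           // its URL (for anchor elements)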

Once you have this, you can easily convert a single HTML table element into a CSV file. Here I use it to pull the text and URLs of a series of links from a page; the resulting URL list is then fed to wget:

function csvEscape(value) {
  // Standard CSV escaping: double any embedded quotes, then wrap in quotes
  return '"' + value.replace(/"/g, '""') + '"';
}

for (var i = 0; i < nodes.snapshotLength; i++) { 
  var item = nodes.snapshotItem(i);
  if (item.innerText != "") {
    console.log(csvEscape(item.innerText) + "," + csvEscape(item.href)); 
  }
}
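
The same snapshot loop handles a whole table just as easily. Here is a minimal sketch, assuming a hypothetical table under the same container; each table row becomes one CSV line:

var tableRows = document.evaluate('//*[@id="hm-lower-background"]//table//tr',
  document, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null);

for (var i = 0; i < tableRows.snapshotLength; i++) {
  // Join the cells of each row with commas, re-using csvEscape from above
  var cells = tableRows.snapshotItem(i).getElementsByTagName('td');
  var line = [];
  for (var j = 0; j < cells.length; j++) {
    line.push(csvEscape(cells[j].innerText));
  }
  console.log(line.join(','));
}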

The link-extraction loop will give you output like this:

"Cathedral Basilica of SS Peter&Paul, Philadelphia PHILA. SOUTH","http://archphila.org/parishes/7000.php"
"St. Agatha , Philadelphia ","http://archphila.org/parishes/0.php"
"St. Adalbert (Polish) , Philadelphia PHILADELPHIA-NORTH","http://archphila.org/parishes/7485.php"
"St. Agatha-St. James , Philadelphia PHILADELPHIA-SOUTH","http://archphila.org/parishes/7490.php"
"St. Agnes , Sellersville BUCKS COUNTY","http://archphila.org/parishes/7500.php"
"St. Agnes , West Chester CHESTER COUNTY","http://archphila.org/parishes/7505.php"
"St. Agnes-St. John Nepomucene (Slovak) , Philadelphia PHILADELPHIA-SOUTH","http://archphila.org/parishes/7495.php"
"St. Albert the Great , Huntingdon Valley MONTGOMERY COUNTY","http://archphila.org/parishes/7510.php"
"St. Alice , Upper Darby DELAWARE COUNTY","http://archphila.org/parishes/7515.php"
"All Saints , Philadelphia PHILADELPHIA-NORTH","http://archphila.org/parishes/7010.php"

You can put all these URLs into a single file, and retrieve each with wget:

wget -i files.txt

It is often helpful to rename all the files – if they are returned with an extension like .php or .aspx, you can rename them to .html so that individual files open in a browser for a quick preview:

for i in *.php ; do
    mv -v "$i" "${i%.php}.html"
done

Next, open a single file and copy XPaths again. In my example, I have a two-column grid I want to extract and then pivot, so that each file becomes a row in the resulting CSV output.

This function loops over all the rows, using the left column as the names (which will later become the CSV column names) and the right column as the values.

function text()
{
  var xpath1 = '//*[@id="hm-lower-background"]/div/table/tbody/tr/td[1]';
  var xpath2 = '//*[@id="hm-lower-background"]/div/table/tbody/tr/td[2]';

  function arr(xpath) {
    var nodes = document.evaluate(xpath, document,
    null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null );

    var arr = [];
    for (var i = 0; i < nodes.snapshotLength; i++) {
      var item = nodes.snapshotItem(i);
      arr[i] = item.innerText.trim();
    }

    return arr;
  }

  var arr1 = arr(xpath1),
      arr2 = arr(xpath2);
  var res = {};
  res['id'] = document.URL.substring(document.URL.lastIndexOf("/") + 1);
  for (var i = 0; i < Math.min(arr1.length, arr2.length); i++) {
    res[arr1[i]] = arr2[i];
  }

  return res;
}

One of the handy things about PhantomJS is that it has access to the file system, so we can write the whole script in one language without being forced to resort to Windows Scripting Host. The next step is to loop through the HTML files, finding the appropriate files to parse:

function ls() {
  var fs = require('fs');

  var result = [];
  var files = fs.list('.');
  for (var i = 0; i < files.length; i++) {
    // Keep only files ending in .html
    if (files[i].indexOf(".html") === files[i].length - 5) {
      result.push(files[i]);
    }
  }
  return result;
}

We then define a function to process each page. This shows one of the idiosyncratic aspects of this type of programming: scoping. We have the ability to run code inside and outside the page context, but there is a fairly tight sandbox between them. I’ve used console.log to pass messages out of the page to the PhantomJS wrapper – this is an area where Python would have been simpler.

To make this actually work, we also need to provide functions to handle the iteration over pages, the final step, and the aggregation of data from each page. The “text” function defined above needs to be inserted inside the page context.

function openPage(next, finish, aggregate) {
  var page = require('webpage').create(),
  system = require('system');

  page.onConsoleMessage = function (msg) {
    var prefix = "aggregationMessage:";
    if (msg.substring(0, prefix.length) === prefix) {
      aggregate(msg.substring(prefix.length));
      openPage(next, finish, aggregate);
    }
  };

  var url = next();
  if (url === undefined) {
    finish();
    return;
  }
  console.log("Loading " + url);

  page.open(encodeURI(url), function (status) {
    // Check for page load success
    if (status !== "success") {
      console.log("Unable to access network");
    } else {
      page.evaluate(function() {
        console.log("Loading...");
        <<< INSERT TEXT FUNCTION HERE >>>
        console.log("aggregationMessage:" + JSON.stringify(text()));
      });
    }
  });
}
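
Rather than pasting the text function in by hand, another option is page.injectJs, which loads a script file into the page context. A sketch of that variant, assuming text() is saved in a separate text.js next to csv.js:

page.open(encodeURI(url), function (status) {
  if (status !== "success") {
    console.log("Unable to access network");
  } else if (page.injectJs('text.js')) {
    // text() is now defined inside the page, so evaluate() can call it
    page.evaluate(function () {
      console.log("aggregationMessage:" + JSON.stringify(text()));
    });
  }
});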

Finally, we can define functions to handle aggregation – here we need to remember to tell PhantomJS to exit when it finishes. This is all defined within a function to keep the scope contained.

function run() {
  var results = "[";
  var idx = 0;
  var files = ls();

  function next() {
    return files[idx++];
  }

  function save(result) {
    var fs = require('fs');

    try {
      fs.write('result.json', result, 'w');
    } catch (e) {
      console.log(e);
    }

    // Remember to tell PhantomJS to exit, or the process keeps running
    phantom.exit();
  }

  function finish() {
    console.log("Saving results");
    // Trim the trailing comma and close the JSON array
    save(results.substring(0, results.length - 1) + "]");
  }

  function aggregate(val) {
    results = results + val + ",";
  }
  openPage(next, finish, aggregate);
}

run();

The above code will generate a nice big JSON file, with everything we asked for:

[{"id":"0.html","Pastoral Planning Area":"",
"Deanery":"","Deanery Description":"","Address":"",
"Founded":"January, 1865","Rectory":"","Phone":"",
"Pastor":"","Email":"","Masses":"","Confession Schedule":"",
"School":"","School Principal":"","Report on the Parish School":""}
]

Now that we have this, we can add functions to pivot the data into CSV form – this is a good place to add post-processing. I found that it takes a few tries to get the data you want; for instance, in this dataset the “Founded” field is in the form “Month, Year”, and I want additional fields with “Year” and “Age” calculated ahead of time. For this dataset, I also need to modify the addresses into a form that a geocoder can handle easily.
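
For the post-processing below, assume the aggregated file has been read back into an array named x. A minimal sketch of that step and of deriving the extra “Year” and “Age” fields from “Founded”:

var fs = require('fs');
// Parse the aggregated output back into an array of per-page objects
var x = JSON.parse(fs.read('result.json'));

// Derive "Year" and "Age" from a "Founded" value like "January, 1865"
for (var i = 0; i < x.length; i++) {
  var match = (x[i]["Founded"] || "").match(/(\d{4})\s*$/);
  if (match) {
    x[i]["Year"] = parseInt(match[1], 10);
    x[i]["Age"] = new Date().getFullYear() - x[i]["Year"];
  }
}

The pivot itself starts by collecting every key seen in any record as a candidate column: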

var columns = {};
for (var i = 0; i < x.length; i++) {
  for (var j in x[i]) {
    columns[j] = '';
  }
}

function nvl(a, b) {
  if (a == undefined) {
    return b
  } else {
    return a
  }
}

function useColumn(k, row) {
  return (k === "id" || k === "Founded" || k === "Address") &&
    (row["Address"] !== "");
}

var rows = '';
for (var j in columns) {
  if (useColumn(j, {})) {
    rows += j + ",";
  }
}
// End the header row before the data rows start
rows += "\n";

var anyCol = false;
for (var i = 0; i < x.length; i++) {
  if (anyCol) {
    rows += "\n";
    anyCol = false;
  }
  for (var j in columns) {
    if (useColumn(j, x[i])) {
      rows += '"' + nvl(x[i][j], '') + '",';
      anyCol = true;
    }
  }
}
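
The last step is to write the assembled string out with the fs module again (result.csv is just an arbitrary name):

var fs = require('fs');
fs.write('result.csv', rows, 'w');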

At last, after many steps, we have one large CSV file, perfect for importing into your database or GIS software of choice:

"name","desc","color","founded","year","age"
"Buckius and E. Thompson Sts., Philadelphia, PA",-0,,"November, 1860",1860,153
"S.E. Cor. 10th and Dickinson Sts., Philadelphia, PA",-0,,"December, 1860",1860,153

While time-consuming, this is fairly rewarding, as you can see from the finished product, a map of churches in the Philadelphia region.