PhantomJS is a headless WebKit browser, which lets you run JavaScript in a browser from the command line. It adds additional API calls which facilitate automated testing, screenshots, and scraping. I thought it would be interesting to write a script to retrieve AdSense destination URLs and text with PhantomJS.
Extracting advertisement blocks requires fairly simple CSS selectors. Google can’t change the format too often, since each publisher must paste a code snippet into their site. Some ad networks render advertisements inside an iframe, so the script may run into browser security restrictions. Extracting ad data from a page of Home Depot’s website gives us the following results:
Drywall Materials Sale, http://www.compare99.com/compare.html%3Fq%3Ddrywall-products%26ort%3DDrywall-Materials-Sale%26adid%3DiaCkp56m1aqplM3OkH6Tp8bUzJKepofRzm52pdrZxJ2eYK7D15aknMLO1lelcMjD2KRYlsnD1W6W Sheetrock, http://shopping.yahoo.com/search%3B_ylc%3DX3oDMTJ1dGkyY2Y5BF9TAzk2MDc5MjYwBGsDc2hlZXRyb2NrBHNlbV9hY3QDMjYyOTkxMDA5MARzZW1fYWRnAzE5NjgwNTY2MwRzZW1fY21wAzM3NDI5MTMEc2VtX2t3aWQDMTU0NTgwMDE-%3Fp%3Dsheetrock%26sem%3DGoogle Sheetrock Material Sale, http://www.buycheapr.com/us/result.jsp%3Fga%3Dus19%26q%3Dsheetrock%2Bmaterial Installation Framing Door, http://www.moifriefacility.com Architectural GFRG, http://www.sbgrace.com WallBuilders Library, http://www.logos.com/products/details/2982%3Fgoogleads
I’ve written a short demo, which retrieves ad text and a screenshot for testing. It is invoked as follows (the source is below, and on GitHub):
phantomjs adsense.js http://www.homedepot.com/Building-Materials-Drywall/FibaTape/h_d1/N-5yc1vZar3dZ38m/h_d2/Navigation?catalogId=10053&Nu=P_PARENT_ID&langId=-1&storeId=10051
The code is almost too easy: tell PhantomJS to load a page, run JavaScript in the page context, and parse the AdSense URL format. As a programming paradigm, it’s a little complex to track scope, since some code runs in the PhantomJS context and some in the page context. PhantomJS scripts do not exit when the script ends, because many browser actions are asynchronous. This requires scripts to track state and add exit() calls at the end of every branch.
var page = require('webpage').create(),
t, address;
if (phantom.args.length === 0) {
console.log('Usage: phantomjs adsense.js ');
phantom.exit();
} else {
t = Date.now();
address = phantom.args[0];
output = phantom.args[1];
page.viewportSize = { width: 600, height: 600 };
page.onConsoleMessage = function (msg) {
console.log('Console log: ' + msg);
};
page.open(address, function (status) {
if (status !== 'success') {
console.log('FAIL to load the address');
phantom.exit();
} else {
page.evaluate(function () {
/**
 * Parse the query string of a URL into a key/value map.
 *
 * This helper is defined inside the function passed to page.evaluate(), so it
 * must be self-contained and limited to ES5 syntax.
 *
 * @param {string} query - A full URL (or any string) whose query string
 *     follows the first "?".
 * @returns {Object} Map of parameter name to decoded value. Returns an empty
 *     object when the input is not a string or has no query string. A
 *     parameter without "=" maps to the empty string.
 */
var parse = function(query) {
  // Decode a percent-encoded value. decodeURIComponent handles UTF-8
  // correctly but throws on malformed sequences (e.g. a bare "%"), so fall
  // back to the lenient legacy unescape() rather than aborting the scrape.
  var decode = function(value) {
    try {
      return decodeURIComponent(value);
    } catch (e) {
      return unescape(value);
    }
  };

  var res = {};
  if (typeof query !== "string") {
    return res;
  }

  // Use everything after the FIRST "?" so a literal "?" inside a parameter
  // value (such as an unescaped destination URL) is not cut off.
  var start = query.indexOf("?");
  if (start === -1) {
    return res;
  }

  var vars = query.substring(start + 1).split("&");
  for (var i = 0; i < vars.length; i++) {
    var item = vars[i];
    if (item === "") {
      continue; // skip empty segments produced by "&&" or a trailing "&"
    }
    // Split on the first "=" only: values such as base64 signatures may
    // legitimately contain "=" characters.
    var eq = item.indexOf("=");
    var key = eq === -1 ? item : item.substring(0, eq);
    var value = eq === -1 ? "" : item.substring(eq + 1);
    res[key] = decode(value);
  }
  return res;
};
console.log(document.title);
var ads = document.querySelectorAll('#googleAdSenseLeft ul li a');
for (var i=0; i
The actual script output is JSON, and a little tedious to read:
Console log: Building Materials - Drywall - FibaTape® at The Home Depot
Console log: {"sa":"l","ai":"CtXg4mpkIUPygG6Ol0AGRloCYCcupmcoEi9O58FOzp_mMrgEQByDHxt8eKAxQ_aetrgRgyb6miYyk1A-gAe2HlNYDyAEBqgQbT9CqfdRE5uzDpvzqUgRUNqtZ3ouY_UBn7VGD","num":"7","sig":"AOD64_0jc0_3b0Au9uLLeud6cAI77O6zrQ","adurl":"http://www.compare99.com/compare.html?q=drywall-products&ort=Drywall-Materials-Sale&adid=iaCkp56m1aqplM3OkH6Tp8bUzJKepofRzm52pdrZxJ2eYK7D15aknMLO1lelcMjD2KRYlsnD1W6W","url":"http://www.google.com/aclk?sa=l&ai=CtXg4mpkIUPygG6Ol0AGRloCYCcupmcoEi9O58FOzp_mMrgEQByDHxt8eKAxQ_aetrgRgyb6miYyk1A-gAe2HlNYDyAEBqgQbT9CqfdRE5uzDpvzqUgRUNqtZ3ouY_UBn7VGD&num=7&sig=AOD64_0jc0_3b0Au9uLLeud6cAI77O6zrQ&adurl=http://www.compare99.com/compare.html%3Fq%3Ddrywall-products%26ort%3DDrywall-Materials-Sale%26adid%3DiaCkp56m1aqplM3OkH6Tp8bUzJKepofRzm52pdrZxJ2eYK7D15aknMLO1lelcMjD2KRYlsnD1W6W","text":"Drywall Materials Sale"}
Console log: {"sa":"L","ai":"CHaiampkIUPygG6Ol0AGRloCYCZu8jlqzr4eAA9G9rwcQCCDHxt8eKAxQ7dCHowNgyb6miYyk1A_IAQGqBB5P0Loa2ETp7MOm_KJSgnGFYNjxasFE9TqY0t8TLpE","num":"8","ggladgrp":"2492582816717521168","gglcreat":"9712000621987456871","sig":"AOD64_1WwDM7Zp2jGv1pdrozELP2CSkZUA","adurl":"http://shopping.yahoo.com/search;_ylc=X3oDMTJ1dGkyY2Y5BF9TAzk2MDc5MjYwBGsDc2hlZXRyb2NrBHNlbV9hY3QDMjYyOTkxMDA5MARzZW1fYWRnAzE5NjgwNTY2MwRzZW1fY21wAzM3NDI5MTMEc2VtX2t3aWQDMTU0NTgwMDE-?p=sheetrock&sem=Google","url":"http://www.google.com/aclk?sa=L&ai=CHaiampkIUPygG6Ol0AGRloCYCZu8jlqzr4eAA9G9rwcQCCDHxt8eKAxQ7dCHowNgyb6miYyk1A_IAQGqBB5P0Loa2ETp7MOm_KJSgnGFYNjxasFE9TqY0t8TLpE&num=8&ggladgrp=2492582816717521168&gglcreat=9712000621987456871&sig=AOD64_1WwDM7Zp2jGv1pdrozELP2CSkZUA&adurl=http://shopping.yahoo.com/search%3B_ylc%3DX3oDMTJ1dGkyY2Y5BF9TAzk2MDc5MjYwBGsDc2hlZXRyb2NrBHNlbV9hY3QDMjYyOTkxMDA5MARzZW1fYWRnAzE5NjgwNTY2MwRzZW1fY21wAzM3NDI5MTMEc2VtX2t3aWQDMTU0NTgwMDE-%3Fp%3Dsheetrock%26sem%3DGoogle","text":"Sheetrock"}
Console log: {"sa":"L","ai":"CELX4mpkIUPygG6Ol0AGRloCYCc-MjpECz_OgsCKf8OKPCRAJIMfG3x4oDFDFt4T4-f____8BYMm-pomMpNQPyAEBqgQeT9CaZ9JE6OzDpvyiUrJ702HY8WrBRPU6mNLfEy6R","num":"9","sig":"AOD64_39oakLNPF7SIjdARg9y73otRYZhQ","adurl":"http://www.buycheapr.com/us/result.jsp?ga=us19&q=sheetrock+material","url":"http://www.google.com/aclk?sa=L&ai=CELX4mpkIUPygG6Ol0AGRloCYCc-MjpECz_OgsCKf8OKPCRAJIMfG3x4oDFDFt4T4-f____8BYMm-pomMpNQPyAEBqgQeT9CaZ9JE6OzDpvyiUrJ702HY8WrBRPU6mNLfEy6R&num=9&sig=AOD64_39oakLNPF7SIjdARg9y73otRYZhQ&adurl=http://www.buycheapr.com/us/result.jsp%3Fga%3Dus19%26q%3Dsheetrock%2Bmaterial","text":"Sheetrock Material Sale"}
Console log: {"sa":"L","ai":"C-S13mpkIUPygG6Ol0AGRloCYCaS2oM0D5P7ugla6r8cGEAogx8bfHigMULiOjo_9_____wFgyb6miYyk1A_IAQGqBBhP0Np5zUTr7MOm_LNT0dUZSgBnPsWIwzM","num":"10","sig":"AOD64_0rVOUBr5lndIy_sed-v9kmQBeqjw","adurl":"http://www.moifriefacility.com","url":"http://www.google.com/aclk?sa=L&ai=C-S13mpkIUPygG6Ol0AGRloCYCaS2oM0D5P7ugla6r8cGEAogx8bfHigMULiOjo_9_____wFgyb6miYyk1A_IAQGqBBhP0Np5zUTr7MOm_LNT0dUZSgBnPsWIwzM&num=10&sig=AOD64_0rVOUBr5lndIy_sed-v9kmQBeqjw&adurl=http://www.moifriefacility.com","text":"Installation Framing Door"}
Console log: {"sa":"L","ai":"CnycAmpkIUPygG6Ol0AGRloCYCem7q4oEqYSS7FKunu8KEAsgx8bfHigMUJbYl_L8_____wFgyb6miYyk1A_IAQGqBB5P0IonikTq7MOm_KJShBu3Z9jxasFE9TqY0t8TLpE","num":"11","sig":"AOD64_1MZGFM0lJ7DsgtuZZ-rv2CP6vcxA","adurl":"http://www.sbgrace.com","url":"http://www.google.com/aclk?sa=L&ai=CnycAmpkIUPygG6Ol0AGRloCYCem7q4oEqYSS7FKunu8KEAsgx8bfHigMUJbYl_L8_____wFgyb6miYyk1A_IAQGqBB5P0IonikTq7MOm_KJShBu3Z9jxasFE9TqY0t8TLpE&num=11&sig=AOD64_1MZGFM0lJ7DsgtuZZ-rv2CP6vcxA&adurl=http://www.sbgrace.com","text":"Architectural GFRG"}
Console log: {"sa":"l","ai":"C86EtmpkIUPygG6Ol0AGRloCYCcComAjImM7lA5iY2DAQDCDHxt8eKAxQzZGFtAFgyb6miYyk1A-gAZCCsf8DyAEBqgQbT9D6P8RE7ezDpvzqUgRUNqtZ3ouY_UBn7VGT","num":"12","sig":"AOD64_0n--_8h8e-W75X5eNYIOhLyJ7ezQ","adurl":"http://www.logos.com/products/details/2982?googleads","url":"http://www.google.com/aclk?sa=l&ai=C86EtmpkIUPygG6Ol0AGRloCYCcComAjImM7lA5iY2DAQDCDHxt8eKAxQzZGFtAFgyb6miYyk1A-gAZCCsf8DyAEBqgQbT9D6P8RE7ezDpvzqUgRUNqtZ3ouY_UBn7VGT&num=12&sig=AOD64_0n--_8h8e-W75X5eNYIOhLyJ7ezQ&adurl=http://www.logos.com/products/details/2982%3Fgoogleads","text":"WallBuilders Library"}
Console log: {"sa":"l","ai":"C86EtmpkIUPygG6Ol0AGRloCYCcComAjImM7lA5iY2DAQDCDHxt8eKAxQzZGFtAFgyb6miYyk1A-gAZCCsf8DyAEBqgQbT9D6P8RE7ezDpvzqUgRUNqtZ3ouY_UBn7VGT","num":"12","sig":"AOD64_0n--_8h8e-W75X5eNYIOhLyJ7ezQ","adurl":"http://www.logos.com/products/details/2982?googleads","url":"http://www.google.com/aclk?sa=l&ai=C86EtmpkIUPygG6Ol0AGRloCYCcComAjImM7lA5iY2DAQDCDHxt8eKAxQzZGFtAFgyb6miYyk1A-gAZCCsf8DyAEBqgQbT9D6P8RE7ezDpvzqUgRUNqtZ3ouY_UBn7VGT&num=12&sig=AOD64_0n--_8h8e-W75X5eNYIOhLyJ7ezQ&adurl=http://www.logos.com/products/details/2982%3Fgoogleads","text":"www.logos.com/"}
Looking at the output, some design decisions made by Google's engineers become apparent. Google must track all clicks in order to charge advertisers and pay publishers, so they redirect everything through a click-tracking URL, much like a URL shortener. The latency must be low or else the viewer will give up waiting for a site to load.
Links contain all information required to load the advertised site, so no database reads are required. The URL contains hashes, which presumably prevent a malicious user from modifying the URL. I suspect that these URLs also expire, by including the date in a hashed value. Clicks are likely written to a sharded database (e.g. BigTable; see also Redis, Cassandra, etc.) and reconciled later.
Many thanks to Ariele for editing
I’m not sure if they changed this since your article, but the code doesn’t work. The iframe contents are not available to scrape via JavaScript. In your example, you have the following:
document.querySelectorAll('#googleAdSenseLeft ul li a');
This doesn’t return anything because the contents of #googleAdSenseLeft are in an iFrame.
When using the console, I try the following:
document.querySelectorAll('#googleAdSenseLeft iframe html');
It returns an empty array as well.
I tried this in Safari, Chrome and Firefox.
Any thoughts on how to overcome this?
Eric
I have a very simple website where the javascript like
shows a google ad.
I’m trying to modify your script to read into the generated iframe, but… it seems to generate another iframe… but there must be a way to get to the ads using PhantomJS?
I just fell in love with PhantomJS and CasperJS.