Generating ARFF files for Weka from Postgres

Since all my scraped data is in Postgres, this is the easiest way to get it out – the fastest iteration possible. At some point I’ll probably switch to a Java library. It’s interesting to see, but probably the only lesson from this is that all ETL scripts are ugly.

with advertisers_ranked as (
	select advertiser_id, replace(replace(lower(advertiser), ' ', '_'), '/', '_') advertiser, 
	6 + dense_rank() over (partition by 1 order by advertiser) advertiser_rank -- 6 for the number of attributes prior to the 'advertiser' attributes
	from advertisers
)
select '@RELATION flippa' line
union all
select '@ATTRIBUTE default numeric' line
union all
select '@ATTRIBUTE siteid string' line
union all
select '@ATTRIBUTE banned {0,1}' line
union all
select '@ATTRIBUTE length numeric' line
union all
select '@ATTRIBUTE h1 numeric' line
union all
select '@ATTRIBUTE h2 numeric' line
union all
select '@ATTRIBUTE h3 numeric' line
union all
(select '@ATTRIBUTE ' || advertiser || ' {0, 1}' line
from advertisers_ranked order by advertiser_rank)
union all
select '@DATA' line
union all
-- there are N advertisers per row, this combines them into one
select '{' || siteid || ', ' || banned || ', ' || length || ', ' || h1 || ', ' || h2 || ', ' || h3 || ', ' || array_to_string(array_agg(advertiser ORDER BY advertiser_rank), ', ') || '}' line
from (
	select distinct
	        '1 ' || s.site_id siteid, 
		'2 ' || (case when seller like '%banned%' then 1 else 0 end) as banned, 
		'3 ' || char_length(description) length,
		'4 ' || (length(description) - length(regexp_replace(lower(description),'h1','','g'))) / length('h1') h1, 
		'5 ' || (length(description) - length(regexp_replace(lower(description),'h2','','g'))) / length('h2') h2,
		'6 ' || (length(description) - length(regexp_replace(lower(description),'h3','','g'))) / length('h3') h3, 
		advertiser_rank || ' 1' advertiser,
		advertiser_rank
	from sites s
	join sites_advertisers on s.site_id = sites_advertisers.site_id 
	join advertisers_ranked a on a.advertiser_id = sites_advertisers.advertiser_id
	join auctions on auctions.site_id = s.site_id
	) a
group by siteid, banned, length, h1, h2, h3

Leave a Reply

Your email address will not be published. Required fields are marked *