Since all my scraped data is in Postgres, this is the easiest way to get it out – the fastest iteration possible. At some point I’ll probably switch to a Java library. It’s interesting to see, but probably the only lesson from this is that all ETL scripts are ugly.
with advertisers_ranked as (
select advertiser_id, replace(replace(lower(advertiser), ' ', '_'), '/', '_') advertiser,
6 + dense_rank() over (partition by 1 order by advertiser) advertiser_rank -- 6 for the number of attributes prior to the 'advertiser' attributes
from advertisers
)
select '@RELATION flippa' line
union all
select '@ATTRIBUTE default numeric' line
union all
select '@ATTRIBUTE siteid string' line
union all
select '@ATTRIBUTE banned {0,1}' line
union all
select '@ATTRIBUTE length numeric' line
union all
select '@ATTRIBUTE h1 numeric' line
union all
select '@ATTRIBUTE h2 numeric' line
union all
select '@ATTRIBUTE h3 numeric' line
union all
(select '@ATTRIBUTE ' || advertiser || ' {0, 1}' line
from advertisers_ranked order by advertiser_rank)
union all
select '@DATA' line
union all
-- there are N advertisers per row, this combines them into one
select '{' || siteid || ', ' || banned || ', ' || length || ', ' || h1 || ', ' || h2 || ', ' || h3 || ', ' || array_to_string(array_agg(advertiser ORDER BY advertiser_rank), ', ') || '}' line
from (
select distinct
'1 ' || s.site_id siteid,
'2 ' || (case when seller like '%banned%' then 1 else 0 end) as banned,
'3 ' || char_length(description) length,
'4 ' || (length(description) - length(regexp_replace(lower(description),'h1','','g'))) / length('h1') h1,
'5 ' || (length(description) - length(regexp_replace(lower(description),'h2','','g'))) / length('h2') h2,
'6 ' || (length(description) - length(regexp_replace(lower(description),'h3','','g'))) / length('h3') h3,
advertiser_rank || ' 1' advertiser,
advertiser_rank
from sites s
join sites_advertisers on s.site_id = sites_advertisers.site_id
join advertisers_ranked a on a.advertiser_id = sites_advertisers.advertiser_id
join auctions on auctions.site_id = s.site_id
) a
group by siteid, banned, length, h1, h2, h3