Various companies, frameworks, and libraries have built functionality resembling relational databases to serve various goals: tighter language integration (R, pandas, linq), niche data applications (solr, mapping software), scaling machines or memories (hadoop, datomic, memcached), etc. The core power of SQL is that it allows you to describe what you want, allowing the machine to pick how to get it.
The general process used by Oracle is as follows:
Query -> Parse Tree -> Generate set of plans -> Scoring each plan -> Pick a plan -> Execution -> Output
An alternate path is to generate an execution plan using heuristics – an easier approach, and the obvious choice for many applications.
An example execution plan can be quite complex:
Rows Execution Plan
——– —————————————————-
12 SORT AGGREGATE
2 SORT GROUP BY
76563 NESTED LOOPS
76575 NESTED LOOPS
19 TABLE ACCESS FULL CN_PAYRUNS_ALL
76570 TABLE ACCESS BY INDEX ROWID CN_POSTING_DETAILS_ALL
76570 INDEX RANGE SCAN (object id 178321)
76563 TABLE ACCESS BY INDEX ROWID CN_PAYMENT_WORKSHEETS_ALL
11432983 INDEX RANGE SCAN (object id 186024)
This is equivalent to some original query, and there are potentially many other equivalent plans. I’d like an API / library (like R / pandas dataframs) which structured queries this way – one could then build a SQL implementation that constructed these plans, or build a new generation of SQL equivalent languages.
Let’s try a proof of concept with simple Javascript objects. We’ll start with a brain-dead way of defining tables, to illustrate how this might work:
schema = {
tables: {
people: [
{
first_name: 'gary',
last_name: 'sieling',
company: 1
},
{
first_name: 'ella',
last_name: 'sieling',
company: 2
}
],
companies: [
{
id: 1,
company_name: 'acme corp'
},
{
id: 2,
company_name: 'bubble'
}
]
},
indexes: {
company_pk: {
data: {
a: {
rowid: 2
},
b: {
rowid: 1
}
}
}
}
}
Ideally an execution plan might look something like this, with each function returning a Promise (or similar) so that results could be streamed back:
sort(
nested_loop_join(
nested_loop_join(
s({
select: ['rowid', 'first_name', 'last_name', 'company'],
from: schema.table1,
where: function(row) { return true }
}),
s({
select: ['rowid'],
from: schema.indexes.company_pk
}),
['rowid', 'rowid'],
['a', 'b']
),
s({
select: true,
from: schema.table1
})
)
)
Let’s start by writing a simple function that can actually pull rows from a table – this supports filtering. It returns a closure, which you call each time you want another row.
function s(x) {
var i = 0;
var where = function() { return true; }
if (x.where) where = x.where;
return function() {
var next = x.from[i++]
if (!next) {
return next;
}
if (where(next)) {
if (x.select) {
var res = {};
$.each(x.select, function(i, col) {
res[col] = next[col];
});
return res;
} else {
return next;
}
}
}
}
results = s({from:schema.tables.people})
results()
Object {first_name: "gary", last_name: "sieling", company: 1}
results()
Object {first_name: "ella", last_name: "sieling", company: 2}
results()
undefined
results = s({from:schema.tables.people, select: ['first_name']})
results()
Object {first_name: "gary"}
results()
Object {first_name: "ella"}
results()
results = s({from:schema.tables.people,
select: ['first_name'], where: function(row) { return row.first_name = 'gary' } })
results()
Object {first_name: "gary"}
results()
Now, to do something more interesting, lets try implementing a join. This is the Nested Loops join in the Oracle plan above – just two for loops with a where clause on the join.
function nested_loop_join(a, b, keys, select) {
output = []
second = [];
while(row_b = b()) {
second[second.length] = row_b;
}
while(row_a = a()) {
$.each(second, function(i, row_b) {
if (row_a[keys[0]] === row_b[keys[1]]) {
var new_row = {};
$.each(select, function(k, col) {
// cheat here for simplicity - should handle aliasing
new_row[col] =
(row_a[col] ? row_a[col] : row_b[col])
})
output[output.length] = new_row;
}
})
}
return s({from: output})
}
joined = nested_loop_join(
s({from:schema.tables.companies}),
s({from:schema.tables.people}),
["id", "company"],
["last_name", "company_name"])
joined()
Object {last_name: "sieling", company_name: "acme corp"}
joined()
Object {last_name: "sieling", company_name: "bubble"}
Up to this point, I’ve avoided implementing indexes due to complexity – lets do a different type of join, a cartesian join. This takes all the rows of the first and second tables regardless of whether they match, then applies a filter at the end:
function cartesian_join(a, b, keys, select) {
output = []
second = [];
while(row_b = b()) {
second[second.length] = row_b;
}
var cols = select;
if (cols) {
$.each(keys, function(i, c) {
if ($.inArray(c, select) === -1) {
cols[cols.length] = c;
}
});
}
while(row_a = a()) {
$.each(second, function(i, row_b) {
var new_row = {};
$.each(cols, function(k, col) {
// cheat here for simplicity - should handle aliasing
new_row[col] =
(row_a[col] ? row_a[col] : row_b[col])
});
output[output.length] = new_row;
})
}
return s({from: output, where: function(row) {
return row[keys[0]] === row[keys[1]];
}});
}
joined()
Object {last_name: "sieling", company_name: "acme corp", id: 1, company: 1}
joined()
undefined
joined()
undefined
joined()
Object {last_name: "sieling", company_name: "bubble", id: 2, company: 2}
Now, let’s write something in a more SQL-ish way – we don’t want to specify the implementation.
from(
schema.tables.company,
schema.tables.person
["company", "id"]
["last_name", "company"]
)
This from function can take two tables, and use a heuristic, like the old row-based optimzer, to filter the results:
function from(table1, table2, keys, cols) {
if (table1.length <= 5 && table2.length <= 5) {
return cartesian_join(s({from:table1}), s({from:table2}), keys, cols)
}
else {
return nested_loop_join(s({from:table1}), s({from:table2}), keys, cols)
}
}
joined = nested_loop_join(
s({from:schema.tables.companies}),
s({from:schema.tables.people}),
["id", "company"],
["last_name", "company_name"])
joined()
Object {last_name: "sieling", company_name: "acme corp"}
joined()
Object {last_name: "sieling", company_name: "bubble"}
To actually build a cost-based optimizer you'd need the ability to track things like table / index sample sizes, histograms, and so on, but could imagine then writing standalone implementations of algorithms in a large library.