Gary Sieling

Six Join Implementations in Javascript

A join is an operation between two tables of data, combining the results by looking up keys from one table in a second table. While conceptually a simple operation, there are many ways to implement it, and understanding the variations is important to understanding database behavior (for a discussion of how the algorithms are chosen, see “SQL Execution Plans in Javascript”).

Obviously this matters for tuning, since it tells you how you can influence the plan, but it also explains one of many reasons that the order of rows a database returns is not guaranteed from run to run of a single query.

To see how this works, first set up a simple equivalent of a database table in Javascript:

schema = {
  tables: {
    people: [
      {
        first_name: 'gary',
        last_name: 'sieling',
        company: 1
      },
      {
        first_name: 'bob',
        last_name: 'sieling',
        company: 1
      },
      {
        first_name: 'ella',
        last_name: 'sieling',
        company: 2
      }
    ],
    companies: [
      {
        id: 1,
        company_name: 'acme corp'
      },
      {
        id: 2,
        company_name: 'bubble'
      }
    ]
  }
}
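
The join implementations below use jQuery for iteration ($.each, $.inArray) and an s() helper from the “SQL Execution Plans in Javascript” post, which wraps a table in a simple query object. The real implementation is in that post; for following along here, all that matters is that s() returns a cursor - a function that yields one row per call. A minimal stand-in might look like this (my own simplification, not the original):

function s(query) {
  // apply the optional filter up front
  var rows = query.where ?
    $.grep(query.from, query.where) : query.from;

  var pos = 0;

  // each call returns the next row; undefined signals exhaustion,
  // which ends the while loops in the joins below
  return function() {
    return rows[pos++];
  };
}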

The simplest type of join is a nested loop join, which nests a loop over one table inside a loop over the other, returning only the rows that match. If table A has m rows, and table B has n rows, this is m x n operations.

Note that this is an inner join – to do an outer join, one must track at each level whether a row has been matched, and insert extras as needed; a sketch of this follows the usage example below.

function nested_loop_join(a, b, keys, select) {
  var output = [];

  // a and b are cursors from s() - each call returns the next row
  var second = [];
  var row_b;
  while (row_b = b()) {
    second[second.length] = row_b;
  }

  var idx = 0;
  var row_a;
  while (row_a = a()) {
    $.each(second, function(i, row_b) {
      if (row_a[keys[0]] === row_b[keys[1]]) {
        var new_row = {};

        $.each(select, function(k, col) {
          // cheat here for simplicity - should handle aliasing
          new_row[col] =
            (row_a[col] ? row_a[col] : row_b[col])
        })

        output[idx++] = new_row;
      }
    })
  }

  return s({from: output})
}
 
joined = nested_loop_join(
  s({from:schema.tables.companies}),
  s({from:schema.tables.people}),
  ["id", "company"],
   ["last_name", "company_name"])

A second option is a cartesian join. This combines every row in table A with every row in table B without comparing keys, then filters the results down to the matching rows. This is also m x n operations, and usually a bad idea, although a database will sometimes choose it when both tables are small.

function cartesian_join(a, b, keys, select) {
  var output = [];
  var second = [];
  var row_b;
  while (row_b = b()) {
    second[second.length] = row_b;
  }

  // copy the select list, then make sure the key columns are
  // included so the filter below has something to compare
  var cols = select ? select.slice() : [];
  $.each(keys, function(i, c) {
    if ($.inArray(c, cols) === -1) {
      cols[cols.length] = c;
    }
  });

  var idx = 0;
  var row_a;
  while (row_a = a()) {
    $.each(second, function(i, row_b) {
      var new_row = {};

      $.each(cols, function(k, col) {
        // cheat here for simplicity - should handle aliasing
        new_row[col] =
          (row_a[col] ? row_a[col] : row_b[col])
      });

      output[idx++] = new_row;
    })
  }

  return s({from: output, where: function(row) {
    return row[keys[0]] === row[keys[1]];
  }});
}
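
It is called the same way as the nested loop version - the key columns are carried through each combined row so the filter has something to compare:

cartesian_join(
  s({from:schema.tables.companies}),
  s({from:schema.tables.people}),
  ["id", "company"],
  ["last_name", "company_name"])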

A third option is to put one of the tables into a hashmap, where the keys are the join column values and the values are the rows. This takes m + n operations – one pass to build the hash, and one pass over the other table to probe it. Ideally the larger of the two tables is the one indexed, since having that hash pre-built reduces the join to a single pass over the smaller table.

function hash_join(a, b, keys, select) {
  var use_first = a.length < b.length;
  var lookup = {};

  // probe with the smaller table, hash the larger one
  var table_to_join = use_first ? a : b;
  var key_to_join = use_first ? keys[0] : keys[1];

  var table_to_hash = use_first ? b : a;
  var key_to_hash = use_first ? keys[1] : keys[0];

  // map each join key value to its row
  for (var i = 0; i < table_to_hash.length; i++)
    lookup[table_to_hash[i][key_to_hash]] = table_to_hash[i];

  var idx = 0;
  var output = [];
  for (var j = 0; j < table_to_join.length; j++) {
    var left_row = table_to_join[j];
    var key = left_row[key_to_join];
    var right_row = lookup[key];

    if (right_row) {
      var new_row = {};

      $.each(select, function(k, col) {
        new_row[col] =
          (left_row[col] ? left_row[col] : right_row[col])
      });

      output[idx++] = new_row;
    }
  }

  return output;
}

hash_join(
  schema.tables.people, 
  schema.tables.companies, 
  ["company", "id"], 
  ["last_name", "company_name"])

A first variation on this technique is a symmetric hash join, which builds a hashmap over each input. Note that this implementation is overly simplistic - I'm assuming that each ID column is a primary key in its table, rather than a column whose values can repeat.

The great thing about this type of join is that it can be done entirely from the indexes, rather than the table data, so there is potentially much less to read into memory. If values are repeated many times this will be extremely fast - in this case it pays for the database to sample the values and maintain a histogram.

function symmetric_hash_join(a, b, keys, select) {
  var lookup_a = {};
  var lookup_b = {};

  // index both inputs by their join key
  for (var i = 0; i < a.length; i++)
    lookup_a[a[i][keys[0]]] = a[i];

  for (var j = 0; j < b.length; j++)
    lookup_b[b[j][keys[1]]] = b[j];

  var output = [];
  var idx = 0;
  $.each(lookup_a, function f(key, row_a) {
    var row_b = lookup_b[key];
    if (row_b) {
      var new_row = {}

      // take each selected column from whichever side has it
      $.each(select, function(k, col) {
        if (row_a[col]) new_row[col] = row_a[col]
      });

      $.each(select, function(k, col) {
        if (row_b[col]) new_row[col] = row_b[col]
      });

      output[idx++] = new_row
    }
  });

  return output;
}
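
As an aside on the histogram point above, the statistic the database needs is just a frequency count per value, which takes only a few lines to compute (a hypothetical sketch, not modeled on any particular database):

function histogram(table, col) {
  var counts = {};
  $.each(table, function(i, row) {
    var v = row[col];
    counts[v] = (counts[v] || 0) + 1;
  });
  return counts;
}

// e.g. histogram(schema.tables.people, 'company') yields {1: 2, 2: 1}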

This join requires that all the hashmaps fit into memory, often an invalid assumption, especially with large tables. An alternate approach is to hash the records, then use the hash to partition the data (e.g. by ranges or by the most significant digits) - this lets you load one block of each table at a time, guaranteed to contain all the rows that could join to each other, and join the blocks in memory. For this post I stuck to simple implementations, but if you want to see how the memory management piece would work, check out “Scaling a DataFrame in Javascript”.
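
A rough sketch of that partitioning step, assuming numeric join keys so a modulo can stand in for the hash function (a real implementation would hash arbitrary values and spill each partition to disk):

function partition(table, key, num_partitions) {
  var partitions = [];
  for (var p = 0; p < num_partitions; p++)
    partitions[p] = [];

  $.each(table, function(i, row) {
    // equal keys always land in the same partition, so partition p of
    // one table only ever joins against partition p of the other
    var h = row[key] % num_partitions;
    partitions[h][partitions[h].length] = row;
  });

  return partitions;
}

// partition(schema.tables.people, 'company', 2) and
// partition(schema.tables.companies, 'id', 2) can then be
// hash-joined one matching pair of partitions at a time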

A final approach is the sort-merge join. This works well if you know that your data is sorted ahead of time. The merge itself takes at most m + n operations (plus the cost of sorting, if the inputs are not already sorted), and has the advantage that you can stream the records - only the current row from each table needs to be in memory at a time.

function sort_merge_join(a, b, keys, select) {
  // sort each table by its own join key
  var sorted_a = a.sort(function f(x, y) {
    return x[keys[0]] === y[keys[0]] ? 0 :
           x[keys[0]] > y[keys[0]] ? 1 : -1;
  });

  var sorted_b = b.sort(function f(x, y) {
    return x[keys[1]] === y[keys[1]] ? 0 :
           x[keys[1]] > y[keys[1]] ? 1 : -1;
  });

  var a_pos = 0;
  var b_pos = 0;

  var a_len = sorted_a.length;
  var b_len = sorted_b.length;

  var output = [];
  var idx = 0;

  while (a_pos < a_len && b_pos < b_len) {
    var row_a = sorted_a[a_pos];
    var row_b = sorted_b[b_pos];

    var id_a = row_a[keys[0]];
    var id_b = row_b[keys[1]];
    if (id_a === id_b) {
      var new_row = {};
      $.each(select, function(k, col) {
        if (row_a[col]) new_row[col] = row_a[col]
      });

      $.each(select, function(k, col) {
        if (row_b[col]) new_row[col] = row_b[col]
      });

      output[idx++] = new_row;

      // advancing both sides assumes unique keys, as noted above
      a_pos++;
      b_pos++;
    } else if (id_a < id_b) {
      a_pos++;
    } else {
      b_pos++;
    }
  }

  return output;
}
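
For symmetry with the earlier examples, it can be called on the raw arrays (note that Array.sort reorders them in place). Because people contains duplicate company values, this simple merge emits only one match per key - handling duplicates properly would require backtracking on one side:

sort_merge_join(
  schema.tables.companies,
  schema.tables.people,
  ["id", "company"],
  ["last_name", "company_name"])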