Source: mllib/clustering/KMeans.js

/*
 * Copyright 2016 IBM Corp.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

var Utils = require('../../utils.js');
var RDD = require('../../rdd/RDD.js');

var KMeansModel = require('./KMeansModel.js')();

var gKernelP;

/**
 * @classdesc
 * K-means clustering with support for multiple parallel runs and a k-means++ like initialization
 * mode (the k-means|| algorithm by Bahmani et al). When multiple concurrent runs are requested,
 * they are executed together with joint passes over the data for efficiency.
 *
 * This is an iterative algorithm that will make multiple passes over the data, so any RDDs given
 * to it should be cached by the user.
 */

/**
 * Constructs a KMeans instance with default parameters: {k: 2, maxIterations: 20, runs: 1,
 * initializationMode: "k-means||", initializationSteps: 5, epsilon: 1e-4, seed: random}.
 *
 * All construction work is delegated to Utils.handleConstructor, which receives the new
 * instance, the complete `arguments` object of this call and the module-level gKernelP.
 * @class
 * @memberof module:eclairjs/mllib/clustering
 */
function KMeans(kernelP, refIdP) {
  // The named parameters are never read directly; whatever the caller supplied is
  // forwarded untouched through the `arguments` object.
  var instance = this;
  var ctorArgs = arguments;

  Utils.handleConstructor(instance, ctorArgs, gKernelP);
}

/**
 * Number of clusters to create (k).
 *
 * This method is a stub in this file: it unconditionally throws and never returns.
 * @returns {Promise.<number>}
 * @throws {Error} always, with the message "not implemented by EclairJS"
 */
KMeans.prototype.getK = function() {
  // Throw a real Error (not a bare string) so callers get a stack trace and can
  // rely on `instanceof Error`.
  throw new Error("not implemented by EclairJS");
};

/**
 * Set the number of clusters to create (k). Default: 2.
 *
 * This method is a stub in this file: it unconditionally throws and never returns,
 * and the argument is ignored.
 * @param {number} k
 * @throws {Error} always, with the message "not implemented by EclairJS"
 */
KMeans.prototype.setK = function(k) {
  // Throw a real Error (not a bare string) so callers get a stack trace and can
  // rely on `instanceof Error`.
  throw new Error("not implemented by EclairJS");
};

/**
 * Maximum number of iterations to run.
 *
 * This method is a stub in this file: it unconditionally throws and never returns.
 * @returns {Promise.<number>}
 * @throws {Error} always, with the message "not implemented by EclairJS"
 */
KMeans.prototype.getMaxIterations = function() {
  // Throw a real Error (not a bare string) so callers get a stack trace and can
  // rely on `instanceof Error`.
  throw new Error("not implemented by EclairJS");
};

/**
 * Set maximum number of iterations to run. Default: 20.
 *
 * This method is a stub in this file: it unconditionally throws and never returns,
 * and the argument is ignored.
 * @param {number} maxIterations
 * @throws {Error} always, with the message "not implemented by EclairJS"
 */
KMeans.prototype.setMaxIterations = function(maxIterations) {
  // Throw a real Error (not a bare string) so callers get a stack trace and can
  // rely on `instanceof Error`.
  throw new Error("not implemented by EclairJS");
};

/**
 * The initialization algorithm. This can be either "random" or "k-means||".
 *
 * This method is a stub in this file: it unconditionally throws and never returns.
 * @returns {Promise.<string>}
 * @throws {Error} always, with the message "not implemented by EclairJS"
 */
KMeans.prototype.getInitializationMode = function() {
  // Throw a real Error (not a bare string) so callers get a stack trace and can
  // rely on `instanceof Error`.
  throw new Error("not implemented by EclairJS");
};

/**
 * Set the initialization algorithm. This can be either "random" to choose random points as
 * initial cluster centers, or "k-means||" to use a parallel variant of k-means++
 * (Bahmani et al., Scalable K-Means++, VLDB 2012). Default: k-means||.
 *
 * This method is a stub in this file: it unconditionally throws and never returns,
 * and the argument is ignored.
 * @param {string} initializationMode
 * @throws {Error} always, with the message "not implemented by EclairJS"
 */
KMeans.prototype.setInitializationMode = function(initializationMode) {
  // Throw a real Error (not a bare string) so callers get a stack trace and can
  // rely on `instanceof Error`.
  throw new Error("not implemented by EclairJS");
};

/**
 * :: Experimental ::
 * Number of runs of the algorithm to execute in parallel.
 *
 * This method is a stub in this file: it unconditionally throws and never returns.
 * @returns {Promise.<number>}
 * @throws {Error} always, with the message "not implemented by EclairJS"
 */
KMeans.prototype.getRuns = function() {
  // Throw a real Error (not a bare string) so callers get a stack trace and can
  // rely on `instanceof Error`.
  throw new Error("not implemented by EclairJS");
};

/**
 * :: Experimental ::
 * Set the number of runs of the algorithm to execute in parallel. We initialize the algorithm
 * this many times with random starting conditions (configured by the initialization mode), then
 * return the best clustering found over any run. Default: 1.
 *
 * This method is a stub in this file: it unconditionally throws and never returns,
 * and the argument is ignored.
 * @param {number} runs
 * @throws {Error} always, with the message "not implemented by EclairJS"
 */
KMeans.prototype.setRuns = function(runs) {
  // Throw a real Error (not a bare string) so callers get a stack trace and can
  // rely on `instanceof Error`.
  throw new Error("not implemented by EclairJS");
};

/**
 * Number of steps for the k-means|| initialization mode.
 *
 * This method is a stub in this file: it unconditionally throws and never returns.
 * @returns {Promise.<number>}
 * @throws {Error} always, with the message "not implemented by EclairJS"
 */
KMeans.prototype.getInitializationSteps = function() {
  // Throw a real Error (not a bare string) so callers get a stack trace and can
  // rely on `instanceof Error`.
  throw new Error("not implemented by EclairJS");
};

/**
 * Set the number of steps for the k-means|| initialization mode. This is an advanced
 * setting -- the default of 5 is almost always enough. Default: 5.
 *
 * This method is a stub in this file: it unconditionally throws and never returns,
 * and the argument is ignored.
 * @param {number} initializationSteps
 * @throws {Error} always, with the message "not implemented by EclairJS"
 */
KMeans.prototype.setInitializationSteps = function(initializationSteps) {
  // Throw a real Error (not a bare string) so callers get a stack trace and can
  // rely on `instanceof Error`.
  throw new Error("not implemented by EclairJS");
};

/**
 * The distance threshold within which we consider centers to have converged.
 *
 * This method is a stub in this file: it unconditionally throws and never returns.
 * @returns {Promise.<number>}
 * @throws {Error} always, with the message "not implemented by EclairJS"
 */
KMeans.prototype.getEpsilon = function() {
  // Throw a real Error (not a bare string) so callers get a stack trace and can
  // rely on `instanceof Error`.
  throw new Error("not implemented by EclairJS");
};

/**
 * Set the distance threshold within which we consider centers to have converged.
 * If all centers move less than this Euclidean distance, we stop iterating one run.
 *
 * This method is a stub in this file: it unconditionally throws and never returns,
 * and the argument is ignored.
 * @param {number} epsilon
 * @throws {Error} always, with the message "not implemented by EclairJS"
 */
KMeans.prototype.setEpsilon = function(epsilon) {
  // Throw a real Error (not a bare string) so callers get a stack trace and can
  // rely on `instanceof Error`.
  throw new Error("not implemented by EclairJS");
};

/**
 * The random seed for cluster initialization.
 *
 * This method is a stub in this file: it unconditionally throws and never returns.
 * @returns {Promise.<number>}
 * @throws {Error} always, with the message "not implemented by EclairJS"
 */
KMeans.prototype.getSeed = function() {
  // Throw a real Error (not a bare string) so callers get a stack trace and can
  // rely on `instanceof Error`.
  throw new Error("not implemented by EclairJS");
};

/**
 * Set the random seed for cluster initialization.
 *
 * This method is a stub in this file: it unconditionally throws and never returns,
 * and the argument is ignored.
 * @param {number} seed
 * @throws {Error} always, with the message "not implemented by EclairJS"
 */
KMeans.prototype.setSeed = function(seed) {
  // Throw a real Error (not a bare string) so callers get a stack trace and can
  // rely on `instanceof Error`.
  throw new Error("not implemented by EclairJS");
};

/**
 * Set the initial starting point, bypassing the random initialization or k-means||.
 * The condition model.k == this.k must be met, failure results
 * in an IllegalArgumentException.
 *
 * This method is a stub in this file: it unconditionally throws and never returns,
 * and the argument is ignored (no model.k check is performed here).
 * @param {module:eclairjs/mllib/clustering.KMeansModel} model
 * @throws {Error} always, with the message "not implemented by EclairJS"
 */
KMeans.prototype.setInitialModel = function(model) {
  // Throw a real Error (not a bare string) so callers get a stack trace and can
  // rely on `instanceof Error`.
  throw new Error("not implemented by EclairJS");
};

/**
 * Train a K-means model on the given set of points; `data` should be cached for high
 * performance, because this is an iterative algorithm.
 *
 * This method is a stub in this file: it unconditionally throws and never returns,
 * and the argument is ignored.
 * @param {module:eclairjs/rdd.RDD} data
 * @returns {module:eclairjs/mllib/clustering.KMeansModel}
 * @throws {Error} always, with the message "not implemented by EclairJS"
 */
KMeans.prototype.run = function(data) {
  // Throw a real Error (not a bare string) so callers get a stack trace and can
  // rely on `instanceof Error`.
  throw new Error("not implemented by EclairJS");
};

//
// static methods
//

/**
 * Trains a k-means model using the given set of parameters.
 *
 * Builds a descriptor for a static `train` call (target, method name, kernel, argument list,
 * return type) and hands it to Utils.generate; whatever Utils.generate returns is returned.
 *
 * The optional arguments are appended to the argument list in order and only when supplied,
 * so they are positional: leaving out an earlier one while supplying a later one shifts the
 * later value into the earlier slot. `runs` and `initializationMode` are skipped when falsy;
 * `seed` is skipped only when it is undefined or null, so a seed of 0 is forwarded.
 *
 * @param {module:eclairjs/rdd.RDD} data  training points stored as `RDD[Vector]`
 * @param {number} k  number of clusters
 * @param {number} maxIterations  max number of iterations
 * @param {number} [runs] number of parallel runs, defaults to 1. The best model is returned.
 * @param {string} [initializationMode] initialization model, either "random" or "k-means||" (default).
 * @param {number} [seed] random seed value for cluster initialization
 * @returns {module:eclairjs/mllib/clustering.KMeansModel}
 */
KMeans.train = function(data,k,maxIterations,runs,initializationMode,seed) {
  var args = [
    {value: data},
    {value: k, type: 'number'},
    {value: maxIterations, type: 'number'}
  ];

  if (runs) {
    args.push({value: runs, type: 'number'});
  }

  // Forwarded without a type tag, exactly as supplied by the caller.
  if (initializationMode) {
    args.push({value: initializationMode});
  }

  // 0 is a legitimate seed, so test for presence rather than truthiness; a plain
  // `if (seed)` would silently discard it.
  if (seed !== undefined && seed !== null) {
    args.push({value: seed});
  }

  var gargs = {
    target: KMeans,
    method: 'train',
    kernelP: gKernelP,
    static: true,
    args: args,
    returnType: KMeansModel
  };

  return Utils.generate(gargs);
};

// Initialization-mode constants. Each value is the constant's own qualified name as a string.
// NOTE(review): presumably the string is resolved to the real constant on the kernel side when
// it is passed as `initializationMode` to KMeans.train, which forwards it without a type tag --
// confirm against Utils.generate.
KMeans.K_MEANS_PARALLEL = 'KMeans.K_MEANS_PARALLEL';
KMeans.RANDOM = 'KMeans.RANDOM';

// NOTE(review): looks like the module path used to locate the matching class on the kernel
// side -- confirm where `moduleLocation` is read.
KMeans.moduleLocation = '/mllib/clustering/KMeans';

module.exports = function(kP) {
  if (kP) gKernelP = kP;

  return KMeans;
};