Source: eclairjs/mllib/clustering/KMeans.js

/*                                                                         
 * Copyright 2016 IBM Corp.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
(function () {

    var JavaWrapper = require(EclairJS_Globals.NAMESPACE + '/JavaWrapper');
    var Logger = require(EclairJS_Globals.NAMESPACE + '/Logger');
    var Utils = require(EclairJS_Globals.NAMESPACE + '/Utils');

    var KMeansModel = require(EclairJS_Globals.NAMESPACE + '/mllib/clustering/KMeansModel');

    /**
     * @classdesc
     * K-means clustering with support for multiple parallel runs and a k-means++ like
     * initialization mode (the k-means|| algorithm by Bahmani et al). When multiple concurrent
     * runs are requested, they are executed together with joint passes over the data for
     * efficiency.
     *
     * This is an iterative algorithm that makes multiple passes over the data, so any RDDs
     * given to it should be cached by the user.
     *
     * The constructor itself only attaches a logger and initialises the JavaWrapper part of the
     * instance with the supplied object. Spark's KMeans defaults are documented as
     * {k: 2, maxIterations: 20, runs: 1, initializationMode: "k-means||",
     * initializationSteps: 5, epsilon: 1e-4, seed: random}.
     * @class
     * @memberof module:eclairjs/mllib/clustering
     * @param {object} [jvmObject] value handed unchanged to the JavaWrapper constructor;
     *     presumably the Spark KMeans instance to wrap -- TODO confirm against JavaWrapper
     */
    var KMeans = function (jvmObject) {
        var loggerName = "KMeans_js";

        // The logger is assigned before the parent constructor runs; keep this order.
        this.logger = Logger.getLogger(loggerName);
        JavaWrapper.apply(this, [jvmObject]);
    };

    // Inherit from JavaWrapper: build the derived prototype, restore its `constructor`
    // back-reference (Object.create leaves it pointing at JavaWrapper), then install it.
    var kmeansPrototype = Object.create(JavaWrapper.prototype);
    kmeansPrototype.constructor = KMeans;
    KMeans.prototype = kmeansPrototype;


    /**
     * Number of clusters to create (k).
     *
     * Not available yet: calling this method always throws.
     * @returns {number} the number of clusters, once this accessor is implemented
     * @throws {string} always, with the message "not implemented by EclairJS"
     */
    KMeans.prototype.getK = function () {
        // Placeholder: forwarding to getK() on the wrapped Java object is not enabled yet.
        throw "not implemented by EclairJS";
    };


    /**
     * Set the number of clusters to create (k). Default: 2.
     *
     * Not available yet: calling this method always throws.
     * @param {number} k
     * @throws {string} always, with the message "not implemented by EclairJS"
     */
    KMeans.prototype.setK = function (k) {
        // Placeholder: forwarding to setK(k) on the wrapped Java object is not enabled yet.
        throw "not implemented by EclairJS";
    };


    /**
     * Maximum number of iterations to run.
     *
     * Not available yet: calling this method always throws.
     * @returns {number} the iteration limit, once this accessor is implemented
     * @throws {string} always, with the message "not implemented by EclairJS"
     */
    KMeans.prototype.getMaxIterations = function () {
        // Placeholder: forwarding to getMaxIterations() on the wrapped Java object is not
        // enabled yet.
        throw "not implemented by EclairJS";
    };


    /**
     * Set maximum number of iterations to run. Default: 20.
     *
     * Not available yet: calling this method always throws.
     * @param {number} maxIterations
     * @throws {string} always, with the message "not implemented by EclairJS"
     */
    KMeans.prototype.setMaxIterations = function (maxIterations) {
        // Placeholder: forwarding to setMaxIterations(maxIterations) on the wrapped Java
        // object is not enabled yet.
        throw "not implemented by EclairJS";
    };


    /**
     * The initialization algorithm. This can be either "random" or "k-means||".
     *
     * Not available yet: calling this method always throws.
     * @returns {string} the initialization mode, once this accessor is implemented
     * @throws {string} always, with the message "not implemented by EclairJS"
     */
    KMeans.prototype.getInitializationMode = function () {
        // Placeholder: forwarding to getInitializationMode() on the wrapped Java object is
        // not enabled yet.
        throw "not implemented by EclairJS";
    };


    /**
     * Set the initialization algorithm. This can be either "random" to choose random points as
     * initial cluster centers, or "k-means||" to use a parallel variant of k-means++
     * (Bahmani et al., Scalable K-Means++, VLDB 2012). Default: k-means||.
     *
     * Not available yet: calling this method always throws.
     * @param {string} initializationMode
     * @throws {string} always, with the message "not implemented by EclairJS"
     */
    KMeans.prototype.setInitializationMode = function (initializationMode) {
        // Placeholder: forwarding to setInitializationMode(initializationMode) on the wrapped
        // Java object is not enabled yet.
        throw "not implemented by EclairJS";
    };


    /**
     * :: Experimental ::
     * Number of runs of the algorithm to execute in parallel.
     *
     * Not available yet: calling this method always throws.
     * @returns {number} the number of parallel runs, once this accessor is implemented
     * @throws {string} always, with the message "not implemented by EclairJS"
     */
    KMeans.prototype.getRuns = function () {
        // Placeholder: forwarding to getRuns() on the wrapped Java object is not enabled yet.
        throw "not implemented by EclairJS";
    };


    /**
     * :: Experimental ::
     * Set the number of runs of the algorithm to execute in parallel. We initialize the algorithm
     * this many times with random starting conditions (configured by the initialization mode), then
     * return the best clustering found over any run. Default: 1.
     *
     * Not available yet: calling this method always throws.
     * @param {number} runs
     * @throws {string} always, with the message "not implemented by EclairJS"
     */
    KMeans.prototype.setRuns = function (runs) {
        // Placeholder: forwarding to setRuns(runs) on the wrapped Java object is not enabled yet.
        throw "not implemented by EclairJS";
    };


    /**
     * Number of steps for the k-means|| initialization mode.
     *
     * Not available yet: calling this method always throws.
     * @returns {number} the number of initialization steps, once this accessor is implemented
     * @throws {string} always, with the message "not implemented by EclairJS"
     */
    KMeans.prototype.getInitializationSteps = function () {
        // Placeholder: forwarding to getInitializationSteps() on the wrapped Java object is
        // not enabled yet.
        throw "not implemented by EclairJS";
    };


    /**
     * Set the number of steps for the k-means|| initialization mode. This is an advanced
     * setting -- the default of 5 is almost always enough. Default: 5.
     *
     * Not available yet: calling this method always throws.
     * @param {number} initializationSteps
     * @throws {string} always, with the message "not implemented by EclairJS"
     */
    KMeans.prototype.setInitializationSteps = function (initializationSteps) {
        // Placeholder: forwarding to setInitializationSteps(initializationSteps) on the
        // wrapped Java object is not enabled yet.
        throw "not implemented by EclairJS";
    };


    /**
     * The distance threshold within which we consider centers to have converged.
     *
     * Not available yet: calling this method always throws.
     * @returns {number} the convergence threshold, once this accessor is implemented
     * @throws {string} always, with the message "not implemented by EclairJS"
     */
    KMeans.prototype.getEpsilon = function () {
        // Placeholder: forwarding to getEpsilon() on the wrapped Java object is not enabled yet.
        throw "not implemented by EclairJS";
    };


    /**
     * Set the distance threshold within which we consider centers to have converged.
     * If all centers move less than this Euclidean distance, we stop iterating one run.
     *
     * Not available yet: calling this method always throws.
     * @param {number} epsilon
     * @throws {string} always, with the message "not implemented by EclairJS"
     */
    KMeans.prototype.setEpsilon = function (epsilon) {
        // Placeholder: forwarding to setEpsilon(epsilon) on the wrapped Java object is not
        // enabled yet.
        throw "not implemented by EclairJS";
    };


    /**
     * The random seed for cluster initialization.
     *
     * Not available yet: calling this method always throws.
     * @returns {number} the random seed, once this accessor is implemented
     * @throws {string} always, with the message "not implemented by EclairJS"
     */
    KMeans.prototype.getSeed = function () {
        // Placeholder: forwarding to getSeed() on the wrapped Java object is not enabled yet.
        throw "not implemented by EclairJS";
    };


    /**
     * Set the random seed for cluster initialization.
     *
     * Not available yet: calling this method always throws.
     * @param {number} seed
     * @throws {string} always, with the message "not implemented by EclairJS"
     */
    KMeans.prototype.setSeed = function (seed) {
        // Placeholder: forwarding to setSeed(seed) on the wrapped Java object is not enabled yet.
        throw "not implemented by EclairJS";
    };


    /**
     * Set the initial starting point, bypassing the random initialization or k-means||.
     * The condition model.k == this.k must be met; failure results
     * in an IllegalArgumentException.
     *
     * Not available yet: calling this method always throws.
     * @param {KMeansModel} model
     * @throws {string} always, with the message "not implemented by EclairJS"
     */
    KMeans.prototype.setInitialModel = function (model) {
        // Placeholder: unwrapping `model` and forwarding it to setInitialModel() on the
        // wrapped Java object is not enabled yet.
        throw "not implemented by EclairJS";
    };


    /**
     * Train a K-means model on the given set of points; `data` should be cached for high
     * performance, because this is an iterative algorithm.
     *
     * Not available yet: calling this method always throws. The static KMeans.train function
     * is the implemented way to train a model.
     * @param {module:eclairjs.RDD} data
     * @returns {KMeansModel} the trained model, once this method is implemented
     * @throws {string} always, with the message "not implemented by EclairJS"
     */
    KMeans.prototype.run = function (data) {
        // Placeholder: unwrapping `data`, calling run() on the wrapped Java object and wrapping
        // the result in a KMeansModel is not enabled yet.
        throw "not implemented by EclairJS";
    };
    //
    // static methods
    //

    // Mirror the initialization-mode names exposed by Spark's KMeans so callers can pass them
    // to KMeans.train without hard-coding the strings. Both values are read once, at load time.
    var SparkKMeansStatics = org.apache.spark.mllib.clustering.KMeans;

    /** Name of the k-means|| initialization mode, as reported by Spark. @type {string} */
    KMeans.K_MEANS_PARALLEL = SparkKMeansStatics.K_MEANS_PARALLEL();
    /** Name of the random initialization mode, as reported by Spark. @type {string} */
    KMeans.RANDOM = SparkKMeansStatics.RANDOM();
    /**
     * Trains a k-means model using the given set of parameters.
     *
     * The trailing arguments are optional. Spark's overloads are positional, so when a later
     * optional argument is supplied while an earlier one is omitted, the omitted one is filled
     * in with its documented default (runs: 1, initializationMode: "k-means||") instead of
     * being passed to Java as `undefined`.
     *
     * @param {module:eclairjs.RDD} data  training points stored as `RDD[Vector]`
     * @param {number} k  number of clusters
     * @param {number} maxIterations  max number of iterations
     * @param {number} [runs] number of parallel runs, defaults to 1. The best model is returned.
     * @param {string} [initializationMode] initialization model, either "random" or "k-means||" (default).
     * @param {number} [seed] random seed value for cluster initialization; 0 is a valid seed
     * @returns {KMeansModel}
     */
    KMeans.train = function (data, k, maxIterations, runs, initializationMode, seed) {
        var data_uw = Utils.unwrapObject(data).rdd();
        var SparkKMeans = org.apache.spark.mllib.clustering.KMeans;
        // Test for presence rather than truthiness: a seed of 0 is legitimate and must not be
        // silently dropped.
        var hasSeed = seed !== undefined && seed !== null;
        var javaObject;

        if (hasSeed || initializationMode) {
            // These overloads require every preceding positional argument, so substitute the
            // documented defaults for any that the caller left out.
            var effectiveRuns = runs || 1;
            var effectiveMode = initializationMode || KMeans.K_MEANS_PARALLEL;
            if (hasSeed) {
                javaObject = SparkKMeans.train(data_uw, k, maxIterations, effectiveRuns, effectiveMode, seed);
            } else {
                javaObject = SparkKMeans.train(data_uw, k, maxIterations, effectiveRuns, effectiveMode);
            }
        } else if (runs) {
            javaObject = SparkKMeans.train(data_uw, k, maxIterations, runs);
        } else {
            javaObject = SparkKMeans.train(data_uw, k, maxIterations);
        }
        return new KMeansModel(javaObject);
    };

    // Export the KMeans constructor; its static members (K_MEANS_PARALLEL, RANDOM, train)
    // are properties of this function and travel with it.
    module.exports = KMeans;

})();