Source: eclairjs/ml/feature/VectorIndexer.js

/*                                                                         
 * Copyright 2016 IBM Corp.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

(function () {

    var PipelineStage = require(EclairJS_Globals.NAMESPACE + '/ml/PipelineStage');
    var Logger = require(EclairJS_Globals.NAMESPACE + '/Logger');
    var Utils = require(EclairJS_Globals.NAMESPACE + '/Utils');


    /**
     * @classdesc
     * Class for indexing categorical feature columns in a dataset of {@link Vector}.
     *
     * This has 2 usage modes:
     *  - Automatically identify categorical features (default behavior)
     *     - This helps process a dataset of unknown vectors into a dataset with some continuous
     *       features and some categorical features. The choice between continuous and categorical
     *       is based upon a maxCategories parameter.
     *     - Set maxCategories to the maximum number of categorical any categorical feature should have.
     *     - E.g.: Feature 0 has unique values {-1.0, 0.0}, and feature 1 values {1.0, 3.0, 5.0}.
     *       If maxCategories = 2, then feature 0 will be declared categorical and use indices {0, 1},
     *       and feature 1 will be declared continuous.
     *  - Index all features, if all features are categorical
     *     - If maxCategories is set to be very large, then this will build an index of unique
     *       values for all features.
     *     - Warning: This can cause problems if features are continuous since this will collect ALL
     *       unique values to the driver.
     *     - E.g.: Feature 0 has unique values {-1.0, 0.0}, and feature 1 values {1.0, 3.0, 5.0}.
     *       If maxCategories >= 3, then both features will be declared categorical.
     *
     * This returns a model which can transform categorical features to use 0-based indices.
     *
     * Index stability:
     *  - This is not guaranteed to choose the same category index across multiple runs.
     *  - If a categorical feature includes value 0, then this is guaranteed to map value 0 to index 0.
     *    This maintains vector sparsity.
     *  - More stability may be added in the future.
     *
     * TODO: Future extensions: The following functionality is planned for the future:
     *  - Preserve metadata in transform; if a feature's metadata is already present, do not recompute.
     *  - Specify certain features to not index, either via a parameter or via existing metadata.
     *  - Add warning if a categorical feature has only 1 category.
     *  - Add option for allowing unknown categories.
     * @class
     * @extends module:eclairjs/ml.PipelineStage
     * @memberof module:eclairjs/ml/feature
     * @param {string} [uid]
     */
    var VectorIndexer = function (uid) {
        this.logger = Logger.getLogger("ml.feature.VectorIndexer_js");
        var jvmObject;
        if (uid) {
            if (uid instanceof org.apache.spark.ml.feature.VectorIndexer) {
                jvmObject = uid;
            } else {
                jvmObject = new org.apache.spark.ml.feature.VectorIndexer(uid);
            }
        } else {
            jvmObject = new org.apache.spark.ml.feature.VectorIndexer();
        }
        PipelineStage.call(this, jvmObject);

    };

    VectorIndexer.prototype = Object.create(PipelineStage.prototype);

    VectorIndexer.prototype.constructor = VectorIndexer;

    /**
     * An immutable unique ID for the object and its derivatives.
     * @returns {string}
     */
    VectorIndexer.prototype.uid = function () {
        return this.getJavaObject().uid();
    };


    /**
     * @param {integer} value
     * @returns {module:eclairjs/ml/feature.VectorIndexer}
     */
    VectorIndexer.prototype.setMaxCategories = function (value) {
        var javaObject = this.getJavaObject().setMaxCategories(value);
        return new VectorIndexer(javaObject);
    };


    /**
     * @param {string} value
     * @returns {module:eclairjs/ml/feature.VectorIndexer}
     */
    VectorIndexer.prototype.setInputCol = function (value) {
        var javaObject = this.getJavaObject().setInputCol(value);
        return new VectorIndexer(javaObject);
    };


    /**
     * @param {string} value
     * @returns {module:eclairjs/ml/feature.VectorIndexer}
     */
    VectorIndexer.prototype.setOutputCol = function (value) {
        var javaObject = this.getJavaObject().setOutputCol(value);
        return new VectorIndexer(javaObject);
    };


    /**
     * @param {module:eclairjs/sql.Dataset} dataset
     * @returns {module:eclairjs/ml/feature.VectorIndexerModel}
     */
    VectorIndexer.prototype.fit = function (dataset) {
        var dataset_uw = Utils.unwrapObject(dataset);
        var javaObject = this.getJavaObject().fit(dataset_uw);
        return Utils.javaToJs(javaObject);
    };


    /**
     * @param {module:eclairjs/sql/types.StructType} schema
     * @returns {module:eclairjs/sql/types.StructType}
     */
    VectorIndexer.prototype.transformSchema = function (schema) {
        var schema_uw = Utils.unwrapObject(schema);
        var javaObject = this.getJavaObject().transformSchema(schema_uw);
        return Utils.javaToJs(javaObject);
    };


    /**
     * @param {module:eclairjs/ml/param.ParamMap} extra
     * @returns {module:eclairjs/ml/feature.VectorIndexer}
     */
    VectorIndexer.prototype.copy = function (extra) {
        var extra_uw = Utils.unwrapObject(extra);
        var javaObject = this.getJavaObject().copy(extra_uw);
        return new VectorIndexer(javaObject);
    };

    /**
     * @returns {module:eclairjs/ml/param.IntParam}
     */
    VectorIndexer.prototype.maxCategories = function () {
        var javaObject = this.getJavaObject().maxCategories();
        return Utils.javaToJs(javaObject);
    };

    /**
     * @returns {integer}
     */
    VectorIndexer.prototype.getMaxCategories = function () {
        return this.getJavaObject().getMaxCategories();
    };

    //
    // static methods
    //


    /**
     * @param {string} path
     * @returns {module:eclairjs/ml/feature.VectorIndexer}
     */
    VectorIndexer.load = function (path) {
        var javaObject = org.apache.spark.ml.feature.VectorIndexer.load(path);
        return new VectorIndexer(javaObject);
    };

    module.exports = VectorIndexer;
})();