/*
* Copyright 2016 IBM Corp.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
var Utils = require('../../utils.js');
var RandomForestModel = require('./model/RandomForestModel.js')();
// Module-level kernel handle. It stays undefined until the exported factory is
// called with a truthy argument, and is then forwarded to
// Utils.handleConstructor and to Utils.generate (as `kernelP`).
// NOTE(review): the "P" suffix suggests this holds a promise — confirm.
var gKernelP;
/**
 * Learner for {@link http://en.wikipedia.org/wiki/Random_forest Random Forest}
 * models, usable for both classification and regression. Continuous and
 * categorical features are both supported.
 *
 * The constructor declares no formal parameters: whatever the caller passes is
 * forwarded untouched, together with the new instance and the module-level
 * kernel handle, to Utils.handleConstructor.
 *
 * @constructor
 * @memberof module:eclairjs/mllib/tree
 * @classdesc
 * The available featureSubsetStrategy settings follow these references:
 *  - log2: tested in Breiman (2001)
 *  - sqrt: recommended by the Breiman manual for random forests
 *  - The defaults of sqrt (classification) and onethird (regression) match the
 *    R randomForest package.
 * See {@link http://www.stat.berkeley.edu/~breiman/randomforest2001.pdf Breiman (2001)}
 * and {@link http://www.stat.berkeley.edu/~breiman/Using_random_forests_V3.1.pdf Breiman manual for random forests}.
 *
 * @param {module:eclairjs/mllib/tree/configuration.Strategy} strategy Configuration of the
 *        random forest algorithm: type of algorithm (classification, regression, etc.),
 *        feature type (continuous, categorical), depth of the tree, quantile calculation
 *        strategy, etc.
 * @param {Number} numTrees If 1, no bootstrapping is used. If > 1, bootstrapping is done.
 * @param {string} featureSubsetStrategy Number of features to consider for splits at each node.
 *        Supported: "auto", "all", "sqrt", "log2", "onethird".
 *        With "auto" the value is derived from numTrees:
 *          numTrees == 1 gives "all";
 *          numTrees > 1 (forest) gives "sqrt" for classification and
 *          "onethird" for regression.
 * @param {number} seed Random seed for bootstrapping and choosing feature subsets.
 */
function RandomForest() {
  // Capture the receiver and the raw arguments object before delegating.
  var instance = this;
  var ctorArgs = arguments;
  Utils.handleConstructor(instance, ctorArgs, gKernelP);
}
//
// static methods
//
/**
 * Trains a decision tree model for binary or multiclass classification.
 *
 * The call is not evaluated locally: the received arguments are wrapped with
 * Utils.wrapArguments and a descriptor of the static `trainClassifier` call is
 * passed to Utils.generate, whose result is returned.
 *
 * @param {module:eclairjs/rdd.RDD} input Training dataset: RDD of {@link LabeledPoint}.
 *        Labels should take values {0, 1, ..., numClasses-1}.
 * @param {number} numClasses Number of classes for classification.
 * @param {Map} categoricalFeaturesInfo Map storing the arity of categorical features.
 *        E.g., an entry (n -> k) indicates that feature n is categorical
 *        with k categories indexed from 0: {0, 1, ..., k-1}.
 * @param {number} numTrees Number of trees in the random forest.
 * @param {string} featureSubsetStrategy Number of features to consider for splits at each node.
 *        Supported: "auto", "all", "sqrt", "log2", "onethird".
 *        With "auto" the value is derived from numTrees:
 *          numTrees == 1 gives "all";
 *          numTrees > 1 (forest) gives "sqrt".
 * @param {string} impurity Criterion used for information gain calculation.
 *        Supported values: "gini" (recommended) or "entropy".
 * @param {number} maxDepth Maximum depth of the tree.
 *        E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.
 *        (suggested value: 4)
 * @param {number} maxBins Maximum number of bins used for splitting features
 *        (suggested value: 100)
 * @param {number} seed Random seed for bootstrapping and choosing feature subsets.
 * @returns {RandomForestModel} a random forest model that can be used for prediction
 */
RandomForest.trainClassifier = function(input,numClasses,categoricalFeaturesInfo,numTrees,featureSubsetStrategy,impurity,maxDepth,maxBins,seed) {
  // Only the arguments actually supplied by the caller are forwarded.
  var wrappedArgs = Utils.wrapArguments(arguments);

  return Utils.generate({
    target: RandomForest,
    method: 'trainClassifier',
    args: wrappedArgs,
    static: true,
    kernelP: gKernelP,
    returnType: RandomForestModel
  });
};
/**
 * Trains a decision tree model for regression.
 *
 * The call is not evaluated locally: the received arguments are wrapped with
 * Utils.wrapArguments and a descriptor of the static `trainRegressor` call is
 * passed to Utils.generate, whose result is returned.
 *
 * @param {module:eclairjs/rdd.RDD} input Training dataset: RDD of {@link LabeledPoint}.
 *        Labels are real numbers.
 * @param {Map} categoricalFeaturesInfo Map storing the arity of categorical features.
 *        E.g., an entry (n -> k) indicates that feature n is categorical
 *        with k categories indexed from 0: {0, 1, ..., k-1}.
 * @param {number} numTrees Number of trees in the random forest.
 * @param {string} featureSubsetStrategy Number of features to consider for splits at each node.
 *        Supported: "auto", "all", "sqrt", "log2", "onethird".
 *        With "auto" the value is derived from numTrees:
 *          numTrees == 1 gives "all";
 *          numTrees > 1 (forest) gives "onethird".
 * @param {string} impurity Criterion used for information gain calculation.
 *        Supported values: "variance".
 * @param {number} maxDepth Maximum depth of the tree.
 *        E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.
 *        (suggested value: 4)
 * @param {number} maxBins Maximum number of bins used for splitting features
 *        (suggested value: 100)
 * @param {number} seed Random seed for bootstrapping and choosing feature subsets.
 * @returns {RandomForestModel} a random forest model that can be used for prediction
 */
RandomForest.trainRegressor = function(input,categoricalFeaturesInfo,numTrees,featureSubsetStrategy,impurity,maxDepth,maxBins,seed) {
  // Only the arguments actually supplied by the caller are forwarded.
  var wrappedArgs = Utils.wrapArguments(arguments);

  return Utils.generate({
    target: RandomForest,
    method: 'trainRegressor',
    args: wrappedArgs,
    static: true,
    kernelP: gKernelP,
    returnType: RandomForestModel
  });
};
// Static path string attached to the constructor.
// NOTE(review): presumably read by Utils to locate the matching remote
// RandomForest implementation — confirm against Utils.
RandomForest.moduleLocation = '/mllib/tree/RandomForest';
module.exports = function(kP) {
if (kP) gKernelP = kP;
return RandomForest;
};