/*
* Copyright 2015 IBM Corp.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
(function () {
var JavaWrapper = require(EclairJS_Globals.NAMESPACE + '/JavaWrapper');
var Utils = require(EclairJS_Globals.NAMESPACE + '/Utils');
var Logger = require(EclairJS_Globals.NAMESPACE + '/Logger');
var logger = Logger.getLogger("DataFrameStatFunctions_js");
/**
 * Statistic functions for {@link DataFrame}s.
 * @constructor
 * @memberof module:eclairjs/sql
 * @since EclairJS 0.1 Spark 1.4.0
 * @classdesc Wraps a JVM-side statistics object. The prototype methods fetch
 * that delegate with getJavaObject(), which is not defined in this file and is
 * presumably inherited from JavaWrapper.
 * @param {object} jvmObject the JVM object to wrap; it is passed unchanged to
 *                 the JavaWrapper constructor.
 */
var DataFrameStatFunctions = function (jvmObject) {
    JavaWrapper.call(this, jvmObject);
};
// Inherit from JavaWrapper and restore `constructor` in one step. The
// descriptor flags mirror what a plain property assignment would produce.
DataFrameStatFunctions.prototype = Object.create(JavaWrapper.prototype, {
    constructor: {
        value: DataFrameStatFunctions,
        writable: true,
        enumerable: true,
        configurable: true
    }
});
/**
 * Calculate the sample covariance of two numerical columns of a DataFrame.
 * Both column names are forwarded unchanged to the wrapped Java object's
 * `cov` method and its result is returned as-is.
 *
 * @param {string} col1 the name of the first column
 * @param {string} col2 the name of the second column
 *
 * @example
 * var stat = peopleDataFrame.stat().cov("income", "networth");
 *
 * @since EclairJS 0.1 Spark 1.4.0
 * @returns {number} the covariance of the two columns.
 */
DataFrameStatFunctions.prototype.cov = function (col1, col2) {
    var frameStats = this.getJavaObject();
    return frameStats.cov(col1, col2);
};
/**
 * Calculates the correlation of two columns of a DataFrame. Currently only supports the Pearson
 * Correlation Coefficient. For Spearman Correlation, consider using RDD methods found in
 * MLlib's Statistics.
 *
 * @param {string} col1 the name of the column
 * @param {string} col2 the name of the column to calculate the correlation against
 * @param {string} [method] currently only "pearson" is supported. When omitted (or any other
 *                 falsy value), the two-argument Java overload is called instead.
 * @example
 * var stat = peopleDataFrame.stat().corr("income", "networth", "pearson");
 *
 * @since EclairJS 0.1 Spark 1.4.0
 * @returns {number} The Pearson Correlation Coefficient.
 */
DataFrameStatFunctions.prototype.corr = function (col1, col2, method) {
    var frameStats = this.getJavaObject();
    // Pick the Java overload by whether a (truthy) method name was supplied.
    return method
        ? frameStats.corr(col1, col2, method)
        : frameStats.corr(col1, col2);
};
/**
 * Computes a pair-wise frequency table of the given columns. Also known as a contingency table.
 * The number of distinct values for each column should be less than 1e4. At most 1e6 non-zero
 * pair frequencies will be returned.
 * The first column of each row will be the distinct values of `col1` and the column names will
 * be the distinct values of `col2`. The name of the first column will be `$col1_$col2`. Counts
 * will be returned as `Long`s. Pairs that have no occurrences will have zero as their counts.
 * Null elements will be replaced by "null", and back ticks will be dropped from elements if they
 * exist.
 *
 * The Java result is converted with `Utils.javaToJs` before being returned.
 *
 * @param {string} col1 The name of the first column. Distinct items will make the first item of
 *                 each row.
 * @param {string} col2 The name of the second column. Distinct items will make the column names
 *                 of the DataFrame.
 *
 * @example
 * var df = sqlContext.createDataFrame([[1,1], [1,2], [2,1], [2,1], [2,3], [3,2], [3,3]], schema);
 * var ct = df.stat().crosstab("key", "value");
 * ct.show();
 * +---------+---+---+---+
 * |key_value|  1|  2|  3|
 * +---------+---+---+---+
 * |        2|  2|  0|  1|
 * |        1|  1|  1|  0|
 * |        3|  0|  1|  1|
 * +---------+---+---+---+
 *
 * @since EclairJS 0.1 Spark 1.4.0
 * @returns {module:eclairjs/sql.DataFrame} A DataFrame containing the contingency table.
 */
DataFrameStatFunctions.prototype.crosstab = function (col1, col2) {
    var frameStats = this.getJavaObject();
    return Utils.javaToJs(frameStats.crosstab(col1, col2));
};
/**
 * Finding frequent items for columns, possibly with false positives. Using the
 * frequent element count algorithm described in
 * {@link http://dx.doi.org/10.1145/762471.762473 the paper by Karp, Schenker, and Papadimitriou}.
 * The `support` should be greater than 1e-4.
 *
 * This function is meant for exploratory data analysis, as we make no guarantee about the
 * backward compatibility of the schema of the resulting {@link DataFrame}.
 *
 * @param {string[]} cols the names of the columns to search frequent items in.
 * @param {number} [support] The minimum frequency for an item to be considered `frequent`. Should
 *                 be greater than 1e-4. When omitted (`undefined` or `null`) the single-argument
 *                 Java overload is called, whose documented default is 1% (0.01). Any number that
 *                 is supplied, including 0, is forwarded to the Java call rather than being
 *                 silently replaced by the default.
 *
 * @example
 * // find the items with a frequency greater than 0.4 (observed 40% of the time) for columns
 * // "a" and "b"
 * var freqSingles = df.stat().freqItems(["a", "b"], 0.4);
 * freqSingles.show();
 * +-----------+-------------+
 * |a_freqItems|  b_freqItems|
 * +-----------+-------------+
 * |    [1, 99]|[-1.0, -99.0]|
 * +-----------+-------------+
 *
 * @since EclairJS 0.1 Spark 1.4.0
 * @returns {module:eclairjs/sql.DataFrame} A Local DataFrame with the Array of frequent items for each column.
 */
DataFrameStatFunctions.prototype.freqItems = function (cols, support) {
    var javaObject;
    // Test for absence explicitly: a truthiness check would treat a supplied
    // 0 (or NaN) as "not given" and quietly fall back to the default support.
    if (support === undefined || support === null) {
        javaObject = this.getJavaObject().freqItems(cols);
    } else {
        javaObject = this.getJavaObject().freqItems(cols, support);
    }
    return Utils.javaToJs(javaObject);
};
/**
 * Returns a stratified sample without replacement based on the fraction given on each stratum.
 * @param {string} col column that defines strata
 * @param {object} fractions plain object mapping each stratum (a distinct value of `col`) to the
 *                 sampling fraction for that stratum, e.g. `{"1": 1.0, "3": 0.5}`. It is passed
 *                 through `Utils.createJavaHashMap` before the Java call. In the example below the
 *                 stratum without an entry (key 2) contributes no rows.
 * @param {integer} seed random seed
 *
 * @example
 * var df = sqlContext.createDataFrame([[1,1], [1,2], [2,1], [2,1], [2,3], [3,2], [3,3]], schema).toDF("key", "value");
 * var fractions = {"1": 1.0, "3": 0.5};
 * df.stat().sampleBy("key", fractions, 36).show();
 * +---+-----+
 * |key|value|
 * +---+-----+
 * |  1|    1|
 * |  1|    2|
 * |  3|    2|
 * +---+-----+
 *
 * @since EclairJS 0.1 Spark 1.5.0
 * @returns {module:eclairjs/sql.DataFrame} a new {@link DataFrame} that represents the stratified sample
 */
DataFrameStatFunctions.prototype.sampleBy = function (col, fractions, seed) {
    // Convert the fractions object first, then hand everything to the Java delegate.
    var strataFractions = Utils.createJavaHashMap(fractions);
    var sampled = this.getJavaObject().sampleBy(col, strataFractions, seed);
    return Utils.javaToJs(sampled);
};
module.exports = DataFrameStatFunctions;
})();