/*
* Copyright 2015 IBM Corp.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
var Utils = require('../utils.js');
var DataFrame = require('./DataFrame.js');
/**
* Statistic functions for {@link DataFrame}s.
*
* The constructor only records its two arguments on the instance. Each
* prototype method of this class passes the instance itself as `target`
* to `Utils.generate`.
*
* @constructor
* @memberof module:eclairjs/sql
* @since EclairJS 0.1 Spark 1.4.0
* @classdesc Groups the statistical operations (cov, corr, crosstab, freqItems, sampleBy)
* that can be run against a DataFrame.
* @param {*} kernelP stored unchanged as `this.kernelP`; presumably a Promise for the
* remote kernel connection - TODO confirm against Utils.generate.
* @param {*} refIdP stored unchanged as `this.refIdP`; presumably a Promise for the id of
* the remote object this instance represents - TODO confirm against Utils.generate.
*/
function DataFrameStatFunctions(kernelP, refIdP) {
this.kernelP = kernelP;
this.refIdP = refIdP;
}
/**
 * Computes the sample covariance between two numerical columns of a DataFrame.
 *
 * @param {string} col1 the name of the first column
 * @param {string} col2 the name of the second column
 *
 * @example
 * // "rand1" and "rand2" are numerical columns of df
 * df.stat().cov("rand1", "rand2").then(function(covariance) {
 *   console.log(covariance); // e.g. 0.065...
 * });
 *
 * @since EclairJS 0.1 Spark 1.4.0
 * @returns {Promise.<number>} the covariance of the two columns.
 */
DataFrameStatFunctions.prototype.cov = function(col1, col2) {
  // Hand over exactly the arguments the caller supplied, not the declared parameter list.
  var wrappedArgs = Utils.wrapArguments(arguments);

  return Utils.generate({
    target: this,
    method: 'cov',
    args: wrappedArgs,
    returnType: Number
  });
};
/**
 * Calculates the correlation of two columns of a DataFrame. Only the Pearson
 * Correlation Coefficient is currently supported. For Spearman Correlation,
 * consider using the RDD methods found in MLlib's Statistics.
 *
 * @param {string} col1 the name of the column
 * @param {string} col2 the name of the column to calculate the correlation against
 * @param {string} [method] correlation method; currently only "pearson" is supported
 *
 * @example
 * peopleDataFrame.stat().corr("income", "networth", "pearson").then(function(coefficient) {
 *   console.log(coefficient);
 * });
 *
 * @since EclairJS 0.1 Spark 1.4.0
 * @returns {Promise.<number>} The Pearson Correlation Coefficient.
 */
DataFrameStatFunctions.prototype.corr = function(col1, col2, method) {
  // `method` is forwarded only when the caller actually passed it, because the
  // raw `arguments` object (not the declared parameters) is what gets wrapped.
  var wrappedArgs = Utils.wrapArguments(arguments);

  return Utils.generate({
    target: this,
    method: 'corr',
    args: wrappedArgs,
    returnType: Number
  });
};
/**
 * Computes a pair-wise frequency table of the given columns, also known as a
 * contingency table.
 *
 * The number of distinct values for each column should be less than 1e4, and at
 * most 1e6 non-zero pair frequencies will be returned.
 *
 * The first column of each row holds the distinct values of `col1`, and the
 * column names are the distinct values of `col2`. The first column is named
 * `$col1_$col2`. Counts are returned as `Long`s, and pairs that never occur get
 * a count of zero. Null elements are replaced by "null", and back ticks are
 * dropped from elements if present.
 *
 * @param {string} col1 The name of the first column. Distinct items will make the first item of
 *                      each row.
 * @param {string} col2 The name of the second column. Distinct items will make the column names
 *                      of the DataFrame.
 *
 * @example
 * var df = sqlContext.createDataFrame([[1, 1], [1, 2], [2, 1], [2, 1], [2, 3], [3, 2], [3, 3]], schema)
 *   .toDF("key", "value");
 * var ct = df.stat().crosstab("key", "value");
 * ct.show();
 * // +---------+---+---+---+
 * // |key_value|  1|  2|  3|
 * // +---------+---+---+---+
 * // |        2|  2|  0|  1|
 * // |        1|  1|  1|  0|
 * // |        3|  0|  1|  1|
 * // +---------+---+---+---+
 *
 * @since EclairJS 0.1 Spark 1.4.0
 * @returns {module:eclairjs/sql.DataFrame} A DataFrame containing the contingency table.
 */
DataFrameStatFunctions.prototype.crosstab = function(col1, col2) {
  // Hand over exactly the arguments the caller supplied, not the declared parameter list.
  var wrappedArgs = Utils.wrapArguments(arguments);

  return Utils.generate({
    target: this,
    method: 'crosstab',
    args: wrappedArgs,
    returnType: DataFrame
  });
};
/**
 * Finds frequent items for columns, possibly with false positives, using the
 * frequent element count algorithm proposed by Karp, Schenker, and Papadimitriou
 * ({@link http://dx.doi.org/10.1145/762471.762473}).
 * The `support` should be greater than 1e-4.
 *
 * This function is meant for exploratory data analysis, as no guarantee is made about the
 * backward compatibility of the schema of the resulting {@link DataFrame}.
 *
 * @param {string[]} cols the names of the columns to search frequent items in.
 * @param {number} [support] The minimum frequency for an item to be considered `frequent`.
 *                           Should be greater than 1e-4. Defaults to 1% (0.01).
 *
 * @example
 * // find the items with a frequency greater than 0.4 (observed 40% of the time) for columns
 * // "a" and "b"
 * var freqSingles = df.stat().freqItems(["a", "b"], 0.4);
 * freqSingles.show();
 * // +-----------+-------------+
 * // |a_freqItems|  b_freqItems|
 * // +-----------+-------------+
 * // |    [1, 99]|[-1.0, -99.0]|
 * // +-----------+-------------+
 *
 * @since EclairJS 0.1 Spark 1.4.0
 * @returns {module:eclairjs/sql.DataFrame} A Local DataFrame with the Array of frequent items for each column.
 */
DataFrameStatFunctions.prototype.freqItems = function(cols, support) {
  // `support` is forwarded only when the caller actually passed it, because the
  // raw `arguments` object (not the declared parameters) is what gets wrapped.
  var wrappedArgs = Utils.wrapArguments(arguments);

  return Utils.generate({
    target: this,
    method: 'freqItems',
    args: wrappedArgs,
    returnType: DataFrame
  });
};
/**
 * Returns a stratified sample without replacement based on the fraction given on each stratum.
 *
 * @param {string} col column that defines strata
 * @param {object} fractions map from stratum value (a distinct value of `col`) to the sampling
 *                           fraction for that stratum, e.g. `{"1": 1.0, "3": 0.5}`. Per the Spark
 *                           `sampleBy` documentation, a stratum that is not listed is treated as
 *                           having fraction zero.
 * @param {integer} seed random seed
 *
 * @example
 * var df = sqlContext.createDataFrame([[1,1], [1,2], [2,1], [2,1], [2,3], [3,2], [3,3]], schema).toDF("key", "value");
 * var fractions = {"1": 1.0, "3": 0.5};
 * df.stat().sampleBy("key", fractions, 36).show();
 * // +---+-----+
 * // |key|value|
 * // +---+-----+
 * // |  1|    1|
 * // |  1|    2|
 * // |  3|    2|
 * // +---+-----+
 *
 * @since EclairJS 0.1 Spark 1.5.0
 * @returns {module:eclairjs/sql.DataFrame} a new {@link DataFrame} that represents the stratified sample
 */
DataFrameStatFunctions.prototype.sampleBy = function(col, fractions, seed) {
  // Hand over exactly the arguments the caller supplied, not the declared parameter list.
  var wrappedArgs = Utils.wrapArguments(arguments);

  return Utils.generate({
    target: this,
    method: 'sampleBy',
    args: wrappedArgs,
    returnType: DataFrame
  });
};
// Path-like identifier attached to the constructor itself (not to instances).
// NOTE(review): presumably read by Utils to locate the matching remote module - confirm.
DataFrameStatFunctions.moduleLocation = '/sql/DataFrameStatFunctions';
// The constructor is this module's only export.
module.exports = DataFrameStatFunctions;