Source: mllib/linalg/distributed/RowMatrix.js

/*
 * Copyright 2016 IBM Corp.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

var Utils = require('../../../utils.js');
var RDD = require('../../../rdd/RDD.js');
var Matrix = require('../Matrix.js');
var SingularValueDecomposition = require('../SingularValueDecomposition.js')();

var gKernelP;

/**
 * Represents a row-oriented distributed Matrix with no meaningful row indices.
 * @memberof module:eclairjs/mllib/linalg/distributed
 * @classdesc
 * @param {module:eclairjs/rdd.RDD} rows stored as an RDD[Vector]
 * @param {number} [nRows] number of rows. A non-positive value means unknown, and then the number of rows will
 *              be determined by the number of records in the RDD `rows`.
 * @param {number} [nCols] number of columns. A non-positive value means unknown, and then the number of
 *              columns will be determined by the size of the first row.
 * @class
 * @extends {DistributedMatrix}
 * @example
 * var RowMatrix = require('eclairjs/mllib/linalg/distributed/RowMatrix');
 * var Vectors = require("'eclairjs/mllib/linalg/Vectors");;
 * var rowsList = [Vectors.dense([1.12, 2.05, 3.12]), Vectors.dense([5.56, 6.28, 8.94]), Vectors.dense([10.2, 8.0, 20.5])];
 * var rows = sc.parallelize(rowsList);
 * var mat = new RowMatrix(rows);
 */
function RowMatrix() {
  Utils.handleConstructor(this, arguments, gKernelP);
}

/**
 * Computes the Gramian matrix `A^T A`. Note that this cannot be computed on matrices with
 * more than 65535 columns.
 * @returns {module:eclairjs/mllib/linalg.Matrix}
 */
RowMatrix.prototype.computeGramianMatrix = function() {
  throw "not implemented by ElairJS";
//   var args ={
//     target: this,
//     method: 'computeGramianMatrix',
//     returnType: Matrix
//
//   };
//
//   return Utils.generate(args);
};


/**
 * Computes singular value decomposition of this matrix. Denote this matrix by A (m x n). This
 * will compute matrices U, S, V such that A ~= U * S * V', where S contains the leading k
 * singular values, U and V contain the corresponding singular vectors.
 *
 * At most k largest non-zero singular values and associated vectors are returned. If there are k
 * such values, then the dimensions of the return will be:
 *  - U is a RowMatrix of size m x k that satisfies U' * U = eye(k),
 *  - s is a Vector of size k, holding the singular values in descending order,
 *  - V is a Matrix of size n x k that satisfies V' * V = eye(k).
 *
 * We assume n is smaller than m, though this is not strictly required.
 * The singular values and the right singular vectors are derived
 * from the eigenvalues and the eigenvectors of the Gramian matrix A' * A. U, the matrix
 * storing the right singular vectors, is computed via matrix multiplication as
 * U = A * (V * S^-1^), if requested by user. The actual method to use is determined
 * automatically based on the cost:
 *  - If n is small (n < 100) or k is large compared with n (k > n / 2), we compute
 *    the Gramian matrix first and then compute its top eigenvalues and eigenvectors locally
 *    on the driver. This requires a single pass with O(n^2^) storage on each executor and
 *    on the driver, and O(n^2^ k) time on the driver.
 *  - Otherwise, we compute (A' * A) * v in a distributive way and send it to ARPACK's DSAUPD to
 *    compute (A' * A)'s top eigenvalues and eigenvectors on the driver node. This requires O(k)
 *    passes, O(n) storage on each executor, and O(n k) storage on the driver.
 *
 * Several internal parameters are set to default values. The reciprocal condition number rCond
 * is set to 1e-9. All singular values smaller than rCond * sigma(0) are treated as zeros, where
 * sigma(0) is the largest singular value. The maximum number of Arnoldi update iterations for
 * ARPACK is set to 300 or k * 3, whichever is larger. The numerical tolerance for ARPACK's
 * eigen-decomposition is set to 1e-10.
 *
 * @note The conditions that decide which method to use internally and the default parameters are
 *       subject to change.
 *
 * @param {number} k  number of leading singular values to keep (0 < k <= n).
 *          It might return less than k if
 *          there are numerically zero singular values or there are not enough Ritz values
 *          converged before the maximum number of Arnoldi update iterations is reached (in case
 *          that matrix A is ill-conditioned).
 * @param {boolean} computeU  whether to compute U
 * @param {number} rCond  the reciprocal condition number. All singular values smaller than rCond * sigma(0)
 *              are treated as zero, where sigma(0) is the largest singular value.
 * @returns {module:eclairjs/mllib/linalg.SingularValueDecomposition}  SingularValueDecomposition(U, s, V). U = null if computeU = false.
 */
RowMatrix.prototype.computeSVD = function(k,computeU,rCond) {
  var args = {
    target: this,
    method: 'computeSVD',
    args: Utils.wrapArguments(arguments),
    returnType: SingularValueDecomposition
  };

  return Utils.generate(args);
};

/**
 * Computes the covariance matrix, treating each row as an observation. Note that this cannot
 * be computed on matrices with more than 65535 columns.
 * @returns {module:eclairjs/mllib/linalg.Matrix}  a local dense matrix of size n x n
 */
RowMatrix.prototype.computeCovariance = function() {
  throw "not implemented by ElairJS";
//   var args ={
//     target: this,
//     method: 'computeCovariance',
//     returnType: Matrix
//
//   };
//
//   return Utils.generate(args);
};


/**
 * Computes the top k principal components.
 * Rows correspond to observations and columns correspond to variables.
 * The principal components are stored a local matrix of size n-by-k.
 * Each column corresponds for one principal component,
 * and the columns are in descending order of component variance.
 * The row data do not need to be "centered" first; it is not necessary for
 * the mean of each column to be 0.
 *
 * Note that this cannot be computed on matrices with more than 65535 columns.
 *
 * @param {number} k  number of top principal components.
 * @returns {module:eclairjs/mllib/linalg.Matrix}  a matrix of size n-by-k, whose columns are principal components
 */
RowMatrix.prototype.computePrincipalComponents = function(k) {
  var args = {
    target: this,
    method: 'computePrincipalComponents',
    args: Utils.wrapArguments(arguments),
    returnType: Matrix
  };

  return Utils.generate(args);
};


/**
 * Computes column-wise summary statistics.
 * @returns {MultivariateStatisticalSummary}
 */
RowMatrix.prototype.computeColumnSummaryStatistics = function() {
  throw "not implemented by ElairJS";
//   var args ={
//     target: this,
//     method: 'computeColumnSummaryStatistics',
//     returnType: MultivariateStatisticalSummary
//
//   };
//
//   return Utils.generate(args);
};


/**
 * Multiply this matrix by a local matrix on the right.
 *
 * @param {module:eclairjs/mllib/linalg.Matrix} B  a local matrix whose number of rows must match the number of columns of this matrix
 *         which preserves partitioning
 * @returns {module:eclairjs/mllib/linalg/distributed.RowMatrix}  a [[org.apache.spark.mllib.linalg.distributed.RowMatrix]] representing the product,
 */
RowMatrix.prototype.multiply = function(B) {
  var args = {
    target: this,
    method: 'multiply',
    args: Utils.wrapArguments(arguments),
    returnType: RowMatrix
  };

  return Utils.generate(args);
};


/**
 * Compute similarities between columns of this matrix using a sampling approach.
 *
 * The threshold parameter is a trade-off knob between estimate quality and computational cost.
 *
 * Setting a threshold of 0 guarantees deterministic correct results, but comes at exactly
 * the same cost as the brute-force approach. Setting the threshold to positive values
 * incurs strictly less computational cost than the brute-force approach, however the
 * similarities computed will be estimates.
 *
 * The sampling guarantees relative-error correctness for those pairs of columns that have
 * similarity greater than the given similarity threshold.
 *
 * To describe the guarantee, we set some notation:
 * Let A be the smallest in magnitude non-zero element of this matrix.
 * Let B be the largest  in magnitude non-zero element of this matrix.
 * Let L be the maximum number of non-zeros per row.
 *
 * For example, for {0,1} matrices: A=B=1.
 * Another example, for the Netflix matrix: A=1, B=5
 *
 * For those column pairs that are above the threshold,
 * the computed similarity is correct to within 20% relative error with probability
 * at least 1 - (0.981)^10/B^
 *
 * The shuffle size is bounded by the *smaller* of the following two expressions:
 *
 * O(n log(n) L / (threshold * A))
 * O(m L^2^)
 *
 * The latter is the cost of the brute-force approach, so for non-zero thresholds,
 * the cost is always cheaper than the brute-force approach.
 *
 * @param {number} [threshold]  Set to 0 for deterministic guaranteed correctness.
 *                  Similarities above this threshold are estimated
 *                  with the cost vs estimate quality trade-off described above.
 *         between columns of this matrix.
 * @returns {CoordinateMatrix}  An n x n sparse upper-triangular matrix of cosine similarities
 */
RowMatrix.prototype.columnSimilarities = function(threshold) {
  throw "not implemented by ElairJS";
// // TODO: handle optional parms 'threshold'
//   var args ={
//     target: this,
//     method: 'columnSimilarities',
//     args: [
//       { value: threshold, type: 'number' ,  optional: true}
//     ],
//     returnType: CoordinateMatrix
//
//   };
//
//   return Utils.generate(args);
};


/**
 * Compute QR decomposition for {@link RowMatrix}. The implementation is designed to optimize the QR
 * decomposition (factorization) for the {@link RowMatrix} of a tall and skinny shape.
 * Reference:
 *  Paul G. Constantine, David F. Gleich. "Tall and skinny QR factorizations in MapReduce
 *  architectures"  ([[http://dx.doi.org/10.1145/1996092.1996103]])
 *
 * @param {boolean} computeQ  whether to computeQ
 * @returns {QRDecomposition}  QRDecomposition(Q, R), Q = null if computeQ = false.
 */
RowMatrix.prototype.tallSkinnyQR = function(computeQ) {
  throw "not implemented by ElairJS";
//   var args ={
//     target: this,
//     method: 'tallSkinnyQR',
//     args: [
//       { value: computeQ, type: 'boolean' }
//     ],
//     returnType: QRDecomposition
//
//   };
//
//   return Utils.generate(args);
};

RowMatrix.prototype.rows = function() {
  var args = {
    target: this,
    method: 'rows',
    returnType: RDD
  };

  return Utils.generate(args);
};

RowMatrix.moduleLocation = '/mllib/linalg/distributed/RowMatrix';

module.exports = function(kP) {
  if (kP) gKernelP = kP;

  return RowMatrix;
};