Source: ml/clustering/LDAModel.js

/*
 * Copyright 2016 IBM Corp.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

module.exports = function(kernelP) {
  return (function() {
    var Utils = require('../../utils.js');

    var Model = require('../Model')();

    var gKernelP = kernelP;

    /**
     * @classdesc
     * Model fitted by {@link module:eclairjs/ml/clustering.LDA}.
     *
     * @class
     * @extends module:eclairjs/ml.Model
     * @memberof module:eclairjs/ml/clustering
     */
    function LDAModel() {
      Utils.handleConstructor(this, arguments, gKernelP);
    }

    LDAModel.prototype = Object.create(Model.prototype);

    LDAModel.prototype.constructor = LDAModel;

    /**
     * The features for LDA should be a {@link Vector} representing the word counts in a document.
     * The vector should be of length vocabSize, with counts for each term (word).
     * @param {string} value
     * @returns {module:eclairjs/mllib/clustering.LDAModel}
     */
    LDAModel.prototype.setFeaturesCol = function(value) {
      var args = {
        target: this,
        method: 'setFeaturesCol',
        args: Utils.wrapArguments(arguments),
        returnType: LDAModel
      };

      return Utils.generate(args);
    };

    /**
     * @param {number} value
     * @returns {module:eclairjs/mllib/clustering.LDAModel}
     */
    LDAModel.prototype.setSeed = function(value) {
      var args = {
        target: this,
        method: 'setSeed',
        args: Utils.wrapArguments(arguments),
        returnType: LDAModel
      };

      return Utils.generate(args);
    };

    /**
     * Transforms the input dataset.
     *
     * WARNING: If this model is an instance of [[DistributedLDAModel]] (produced when {@link optimizer}
     *          is set to "em"), this involves collecting a large {@link topicsMatrix} to the driver.
     *          This implementation may be changed in the future.
     * @param {module:eclairjs/sql.Dataset} dataset
     * @returns {module:eclairjs/sql.Dataset}
     */
    LDAModel.prototype.transform = function(dataset) {
      var Dataset = require('../../sql/Dataset');

      var args = {
        target: this,
        method: 'transform',
        args: Utils.wrapArguments(arguments),
        returnType: Dataset
      };

      return Utils.generate(args);
    };

    /**
     * @param {module:eclairjs/sql/types.StructType} schema
     * @returns {module:eclairjs/sql/types.StructType}
     */
    LDAModel.prototype.transformSchema = function(schema) {
      var StructType = require('../../sql/types/StructType')();

      var args = {
        target: this,
        method: 'transformSchema',
        args: Utils.wrapArguments(arguments),
        returnType: StructType
      };

      return Utils.generate(args);
    };

    /**
     * Value for {@link docConcentration} estimated from data.
     * If Online LDA was used and {@link optimizeDocConcentration} was set to false,
     * then this returns the fixed (given) value for the {@link docConcentration} parameter.
     * @returns {module:eclairjs/mllib/linalg.Vector}
     */
    LDAModel.prototype.estimatedDocConcentration = function() {
      var Vector = require('../../mllib/linalg/Vector');

      var args = {
        target: this,
        method: 'estimatedDocConcentration',
        args: Utils.wrapArguments(arguments),
        returnType: Vector
      };

      return Utils.generate(args);
    };

    /**
     * Inferred topics, where each topic is represented by a distribution over terms.
     * This is a matrix of size vocabSize x k, where each column is a topic.
     * No guarantees are given about the ordering of the topics.
     *
     * WARNING: If this model is actually a {@link DistributedLDAModel} instance produced by
     *          the Expectation-Maximization ("em") {@link optimizer}, then this method could involve
     *          collecting a large amount of data to the driver (on the order of vocabSize x k).
     * @returns {module:eclairjs/mllib/linalg.Matrix}
     */
    LDAModel.prototype.topicsMatrix = function() {
      var Matrix = require('../../mllib/linalg/Matrix');

      var args = {
        target: this,
        method: 'topicsMatrix',
        args: Utils.wrapArguments(arguments),
        returnType: Matrix
      };

      return Utils.generate(args);
    };

    /**
     *  Indicates whether this instance is of type {@link DistributedLDAModel}
     * @returns {Promise.<boolean>}
     */
    LDAModel.prototype.isDistributed = function() {
      var args = {
        target: this,
        method: 'isDistributed',
        args: Utils.wrapArguments(arguments),
        returnType: Boolean
      };

      return Utils.generate(args);
    };

    /**
     * Calculates a lower bound on the log likelihood of the entire corpus.
     *
     * See Equation (16) in the Online LDA paper (Hoffman et al., 2010).
     *
     * WARNING: If this model is an instance of [[DistributedLDAModel]] (produced when {@link optimizer}
     *          is set to "em"), this involves collecting a large {@link topicsMatrix} to the driver.
     *          This implementation may be changed in the future.
     *
     * @param {module:eclairjs/sql.Dataset} dataset   test corpus to use for calculating log likelihood
     * @returns {Promise.<number>}  variational lower bound on the log likelihood of the entire corpus
     */
    LDAModel.prototype.logLikelihood = function(dataset) {
      var args = {
        target: this,
        method: 'logLikelihood',
        args: Utils.wrapArguments(arguments),
        returnType: Number
      };

      return Utils.generate(args);
    };

    /**
     * Calculate an upper bound bound on perplexity.  (Lower is better.)
     * See Equation (16) in the Online LDA paper (Hoffman et al., 2010).
     *
     * WARNING: If this model is an instance of [[DistributedLDAModel]] (produced when {@link optimizer}
     *          is set to "em"), this involves collecting a large {@link topicsMatrix} to the driver.
     *          This implementation may be changed in the future.
     *
     * @param {module:eclairjs/sql.Dataset} dataset  test corpus to use for calculating perplexity
     * @returns {Promise.<number>}  Variational upper bound on log perplexity per token.
     */
    LDAModel.prototype.logPerplexity = function(dataset) {
      var args = {
        target: this,
        method: 'logPerplexity',
        args: Utils.wrapArguments(arguments),
        returnType: Number
      };

      return Utils.generate(args);
    };

    /**
     * Return the topics described by their top-weighted terms.
     *
     * @param {number} [maxTermsPerTopic]   Maximum number of terms to collect for each topic.
     *                          Default value of 10.
     *           - "topic": IntegerType: topic index
     *           - "termIndices": ArrayType(IntegerType): term indices, sorted in order of decreasing
     *                            term importance
     *           - "termWeights": ArrayType(DoubleType): corresponding sorted term weights
     * @returns {module:eclairjs/sql.Dataset}   Local Dataset with one topic per Row, with columns:
     */
    LDAModel.prototype.describeTopics = function(maxTermsPerTopic) {
      var Dataset = require('../../sql/Dataset');

      var args = {
        target: this,
        method: 'describeTopics',
        args: Utils.wrapArguments(arguments),
        returnType: Dataset
      };

      return Utils.generate(args);
    };

    LDAModel.moduleLocation = '/ml/clustering/LDAModel';

    return LDAModel;
  })();
};