/*
* Copyright 2016 IBM Corp.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
(function () {
var JavaWrapper = require(EclairJS_Globals.NAMESPACE + '/JavaWrapper');
var Logger = require(EclairJS_Globals.NAMESPACE + '/Logger');
var Utils = require(EclairJS_Globals.NAMESPACE + '/Utils');
/**
 * @classdesc
 * Word2Vec creates vector representations of the words in a text corpus.
 * The algorithm first builds a vocabulary from the corpus and then learns a
 * vector representation for every word in that vocabulary. The resulting
 * vectors can be used as features in natural language processing and
 * machine learning algorithms.
 *
 * The implementation uses the skip-gram model and trains it with the
 * hierarchical softmax method. The variable names in the implementation
 * match the original C implementation.
 *
 * For the original C implementation, see https://code.google.com/p/word2vec/
 * For research papers, see
 * Efficient Estimation of Word Representations in Vector Space
 * and
 * Distributed Representations of Words and Phrases and their Compositionality.
 * @class
 * @memberof module:eclairjs/mllib/feature
 * @param {object} [jvmObject] JVM object to wrap; when omitted (or falsy) a new
 *     org.apache.spark.mllib.feature.Word2Vec is created and wrapped instead.
 */
var Word2Vec = function (jvmObject) {
    this.logger = Logger.getLogger("mllib_feature_Word2Vec_js");
    // Fall back to a freshly constructed JVM Word2Vec when the caller supplied nothing.
    var wrapped = jvmObject ? jvmObject : new org.apache.spark.mllib.feature.Word2Vec();
    JavaWrapper.call(this, wrapped);
};
// Inherit the JavaWrapper behaviour and restore the constructor reference
// that replacing the prototype object would otherwise lose.
Word2Vec.prototype = Object.create(JavaWrapper.prototype);
Word2Vec.prototype.constructor = Word2Vec;
/**
 * Sets the vector size (default: 100).
 * Delegates to the wrapped Java object's setVectorSize and wraps the value it
 * returns in a new Word2Vec.
 * @param {integer} vectorSize
 * @returns {module:eclairjs/mllib/feature.Word2Vec}
 */
Word2Vec.prototype.setVectorSize = function (vectorSize) {
    return new Word2Vec(this.getJavaObject().setVectorSize(vectorSize));
};
/**
 * Sets the initial learning rate (default: 0.025).
 * Delegates to the wrapped Java object's setLearningRate and wraps the value
 * it returns in a new Word2Vec.
 * @param {float} learningRate
 * @returns {module:eclairjs/mllib/feature.Word2Vec}
 */
Word2Vec.prototype.setLearningRate = function (learningRate) {
    return new Word2Vec(this.getJavaObject().setLearningRate(learningRate));
};
/**
 * Sets the number of partitions (default: 1). Use a small number for accuracy.
 * Delegates to the wrapped Java object's setNumPartitions and wraps the value
 * it returns in a new Word2Vec.
 * @param {integer} numPartitions
 * @returns {module:eclairjs/mllib/feature.Word2Vec}
 */
Word2Vec.prototype.setNumPartitions = function (numPartitions) {
    return new Word2Vec(this.getJavaObject().setNumPartitions(numPartitions));
};
/**
 * Sets the number of iterations (default: 1), which should be smaller than or
 * equal to the number of partitions.
 * Delegates to the wrapped Java object's setNumIterations and wraps the value
 * it returns in a new Word2Vec.
 * @param {integer} numIterations
 * @returns {module:eclairjs/mllib/feature.Word2Vec}
 */
Word2Vec.prototype.setNumIterations = function (numIterations) {
    return new Word2Vec(this.getJavaObject().setNumIterations(numIterations));
};
/**
 * Sets the random seed (default: a random integer).
 * Delegates to the wrapped Java object's setSeed and wraps the value it
 * returns in a new Word2Vec.
 * @param {integer} seed
 * @returns {module:eclairjs/mllib/feature.Word2Vec}
 */
Word2Vec.prototype.setSeed = function (seed) {
    return new Word2Vec(this.getJavaObject().setSeed(seed));
};
/**
 * Sets the window of words (default: 5).
 * Delegates to the wrapped Java object's setWindowSize and wraps the value it
 * returns in a new Word2Vec.
 * @param {integer} window
 * @returns {module:eclairjs/mllib/feature.Word2Vec}
 */
Word2Vec.prototype.setWindowSize = function (window) {
    return new Word2Vec(this.getJavaObject().setWindowSize(window));
};
/**
 * Sets minCount, the minimum number of times a token must appear to be
 * included in the word2vec model's vocabulary (default: 5).
 * Delegates to the wrapped Java object's setMinCount and wraps the value it
 * returns in a new Word2Vec.
 * @param {integer} minCount
 * @returns {module:eclairjs/mllib/feature.Word2Vec}
 */
Word2Vec.prototype.setMinCount = function (minCount) {
    return new Word2Vec(this.getJavaObject().setMinCount(minCount));
};
/**
 * Computes the vector representation of each word in the vocabulary.
 * The dataset is unwrapped with Utils.unwrapObject, handed to the wrapped Java
 * object's fit method, and the result is converted with Utils.javaToJs.
 * @param {module:eclairjs.RDD} dataset an RDD of words
 * @returns {module:eclairjs/mllib/feature.Word2VecModel} a Word2VecModel
 */
Word2Vec.prototype.fit = function (dataset) {
    // Unwrap first so the Java side receives the underlying object rather than the JS wrapper.
    var unwrappedDataset = Utils.unwrapObject(dataset);
    return Utils.javaToJs(this.getJavaObject().fit(unwrappedDataset));
};
// Expose the Word2Vec constructor as this module's export.
module.exports = Word2Vec;
})();