Source: eclairjs/ml/feature/VectorIndexer.js

  1. /*
  2. * Copyright 2016 IBM Corp.
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. (function () {
  17. var PipelineStage = require(EclairJS_Globals.NAMESPACE + '/ml/PipelineStage');
  18. var Logger = require(EclairJS_Globals.NAMESPACE + '/Logger');
  19. var Utils = require(EclairJS_Globals.NAMESPACE + '/Utils');
  20. /**
  21. * @classdesc
  22. * Class for indexing categorical feature columns in a dataset of {@link Vector}.
  23. *
  24. * This has 2 usage modes:
  25. * - Automatically identify categorical features (default behavior)
  26. * - This helps process a dataset of unknown vectors into a dataset with some continuous
  27. * features and some categorical features. The choice between continuous and categorical
  28. * is based upon a maxCategories parameter.
  29. * - Set maxCategories to the maximum number of categories any categorical feature should have.
  30. * - E.g.: Feature 0 has unique values {-1.0, 0.0}, and feature 1 values {1.0, 3.0, 5.0}.
  31. * If maxCategories = 2, then feature 0 will be declared categorical and use indices {0, 1},
  32. * and feature 1 will be declared continuous.
  33. * - Index all features, if all features are categorical
  34. * - If maxCategories is set to be very large, then this will build an index of unique
  35. * values for all features.
  36. * - Warning: This can cause problems if features are continuous since this will collect ALL
  37. * unique values to the driver.
  38. * - E.g.: Feature 0 has unique values {-1.0, 0.0}, and feature 1 values {1.0, 3.0, 5.0}.
  39. * If maxCategories >= 3, then both features will be declared categorical.
  40. *
  41. * This returns a model which can transform categorical features to use 0-based indices.
  42. *
  43. * Index stability:
  44. * - This is not guaranteed to choose the same category index across multiple runs.
  45. * - If a categorical feature includes value 0, then this is guaranteed to map value 0 to index 0.
  46. * This maintains vector sparsity.
  47. * - More stability may be added in the future.
  48. *
  49. * TODO: Future extensions: The following functionality is planned for the future:
  50. * - Preserve metadata in transform; if a feature's metadata is already present, do not recompute.
  51. * - Specify certain features to not index, either via a parameter or via existing metadata.
  52. * - Add warning if a categorical feature has only 1 category.
  53. * - Add option for allowing unknown categories.
  54. * @class
  55. * @extends module:eclairjs/ml.PipelineStage
  56. * @memberof module:eclairjs/ml/feature
  57. * @param {string} [uid]
  58. */
  var VectorIndexer = function (uid) {
    this.logger = Logger.getLogger("ml.feature.VectorIndexer_js");

    var jvmObject;
    if (!uid) {
      // Falsy uid: build the Spark object with its no-argument constructor.
      jvmObject = new org.apache.spark.ml.feature.VectorIndexer();
    } else if (uid instanceof org.apache.spark.ml.feature.VectorIndexer) {
      // The argument is already a Spark VectorIndexer: wrap it as-is.
      jvmObject = uid;
    } else {
      // Anything else is forwarded to the Spark constructor as the uid.
      jvmObject = new org.apache.spark.ml.feature.VectorIndexer(uid);
    }

    PipelineStage.call(this, jvmObject);
  };

  // Inherit from PipelineStage and restore the constructor reference
  // that replacing the prototype object would otherwise lose.
  VectorIndexer.prototype = Object.create(PipelineStage.prototype);
  VectorIndexer.prototype.constructor = VectorIndexer;
  /**
   * An immutable unique ID for the object and its derivatives,
   * as reported by the wrapped Java object.
   * @returns {string}
   */
  VectorIndexer.prototype.uid = function () {
    var wrapped = this.getJavaObject();
    return wrapped.uid();
  };
  /**
   * Calls setMaxCategories on the wrapped Java object.
   * @param {integer} value
   * @returns {module:eclairjs/ml/feature.VectorIndexer} a new wrapper around the object returned by the Java setter
   */
  VectorIndexer.prototype.setMaxCategories = function (value) {
    return new VectorIndexer(this.getJavaObject().setMaxCategories(value));
  };
  /**
   * Calls setInputCol on the wrapped Java object.
   * @param {string} value
   * @returns {module:eclairjs/ml/feature.VectorIndexer} a new wrapper around the object returned by the Java setter
   */
  VectorIndexer.prototype.setInputCol = function (value) {
    return new VectorIndexer(this.getJavaObject().setInputCol(value));
  };
  /**
   * Calls setOutputCol on the wrapped Java object.
   * @param {string} value
   * @returns {module:eclairjs/ml/feature.VectorIndexer} a new wrapper around the object returned by the Java setter
   */
  VectorIndexer.prototype.setOutputCol = function (value) {
    return new VectorIndexer(this.getJavaObject().setOutputCol(value));
  };
  /**
   * Unwraps the dataset, calls fit on the wrapped Java object and converts
   * the result back to its JavaScript counterpart.
   * @param {module:eclairjs/sql.Dataset} dataset
   * @returns {module:eclairjs/ml/feature.VectorIndexerModel}
   */
  VectorIndexer.prototype.fit = function (dataset) {
    // Unwrap first, then resolve the Java object, matching the call order callers see today.
    var rawDataset = Utils.unwrapObject(dataset);
    return Utils.javaToJs(this.getJavaObject().fit(rawDataset));
  };
  /**
   * Unwraps the schema, calls transformSchema on the wrapped Java object and
   * converts the result back to its JavaScript counterpart.
   * @param {module:eclairjs/sql/types.StructType} schema
   * @returns {module:eclairjs/sql/types.StructType}
   */
  VectorIndexer.prototype.transformSchema = function (schema) {
    var rawSchema = Utils.unwrapObject(schema);
    return Utils.javaToJs(this.getJavaObject().transformSchema(rawSchema));
  };
  /**
   * Unwraps the extra params, calls copy on the wrapped Java object and wraps
   * the returned Java object in a new VectorIndexer.
   * @param {module:eclairjs/ml/param.ParamMap} extra
   * @returns {module:eclairjs/ml/feature.VectorIndexer}
   */
  VectorIndexer.prototype.copy = function (extra) {
    var rawExtra = Utils.unwrapObject(extra);
    return new VectorIndexer(this.getJavaObject().copy(rawExtra));
  };
  /**
   * Returns the maxCategories param of the wrapped Java object, converted to
   * its JavaScript counterpart.
   * @returns {module:eclairjs/ml/param.IntParam}
   */
  VectorIndexer.prototype.maxCategories = function () {
    return Utils.javaToJs(this.getJavaObject().maxCategories());
  };
  /**
   * Returns the value of getMaxCategories from the wrapped Java object.
   * @returns {integer}
   */
  VectorIndexer.prototype.getMaxCategories = function () {
    var wrapped = this.getJavaObject();
    return wrapped.getMaxCategories();
  };
  146. //
  147. // static methods
  148. //
  /**
   * Calls the static load of the Spark VectorIndexer class with the given
   * path and wraps the returned Java object.
   * @param {string} path
   * @returns {module:eclairjs/ml/feature.VectorIndexer}
   */
  VectorIndexer.load = function (path) {
    return new VectorIndexer(org.apache.spark.ml.feature.VectorIndexer.load(path));
  };
  157. module.exports = VectorIndexer;
  158. })();