/*
* Copyright 2015 IBM Corp.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
(function () {
var JavaWrapper = require(EclairJS_Globals.NAMESPACE + '/JavaWrapper');
var Utils = require(EclairJS_Globals.NAMESPACE + '/Utils');
var Logger = require(EclairJS_Globals.NAMESPACE + '/Logger');
var logger = Logger.getLogger("sql.DataFrameReader_js");
/**
* @constructor
* @memberof module:eclairjs/sql
* @classdesc Interface used to load a Dataset from external storage systems (e.g. file systems, key-value stores, etc).
* Use SQLContext.read to access this.
*/
var DataFrameReader = function (javaDataFrameReader) {
var jvmObj;
jvmObj = javaDataFrameReader;
JavaWrapper.call(this, jvmObj);
};
DataFrameReader.prototype = Object.create(JavaWrapper.prototype);
//Set the "constructor" property to refer to DataFrameReader
DataFrameReader.prototype.constructor = DataFrameReader;
/**
* Specifies the input data source format.
*
* @since EclairJS 0.1 Spark 1.4.0
* @param {string} source
* @returns {module:eclairjs/sql.DataFrameReader}
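* @example
* // Illustrative usage; assumes a SQLContext named `sqlContext` and an illustrative input path.
* var df = sqlContext.read().format("json").load("/path/to/people.json");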
*/
DataFrameReader.prototype.format = function (source) {
var javaObject = this.getJavaObject().format(source);
return new DataFrameReader(javaObject);
};
/**
* Specifies the input schema. Some data sources (e.g. JSON) can infer the input schema
* automatically from data. By specifying the schema here, the underlying data source can
* skip the schema inference step, and thus speed up data loading.
*
* @since EclairJS 0.1 Spark 1.4.0
* @param {module:eclairjs/sql/types.StructType} schema
* @returns {module:eclairjs/sql.DataFrameReader}
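* @example
* // Illustrative sketch; assumes `peopleSchema` is a StructType built with the
* // eclairjs/sql/types module, and an illustrative input path.
* var df = sqlContext.read().schema(peopleSchema).json("/path/to/people.json");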
*/
DataFrameReader.prototype.schema = function (schema) {
var schema_uw = Utils.unwrapObject(schema);
var javaObject = this.getJavaObject().schema(schema_uw);
return new DataFrameReader(javaObject);
};
/**
* Adds an input option for the underlying data source.
*
* @since EclairJS 0.1 Spark 1.4.0
* @param {string | object} keyOrMap
* If a string, the option key. If an object, it is treated as a map of option key/value pairs;
* both keys and values must be of type `String`.
* @param {string} [value] the option value, required when `keyOrMap` is a string key.
* @returns {module:eclairjs/sql.DataFrameReader}
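* @example
* // Illustrative usage; assumes a SQLContext named `sqlContext`. "header" and "inferSchema"
* // are standard Spark CSV options; the path is illustrative.
* var df = sqlContext.read().format("csv")
*     .option("header", "true")
*     .option("inferSchema", "true")
*     .load("/path/to/people.csv");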
*/
DataFrameReader.prototype.option = function (keyOrMap, value) {
var javaObject;
if (typeof keyOrMap === 'object') {
var map = Utils.createJavaHashMap(keyOrMap);
javaObject = this.getJavaObject().option(map);
} else {
javaObject = this.getJavaObject().option(keyOrMap, value);
}
return new DataFrameReader(javaObject);
};
/**
* Adds input options for the underlying data source.
*
* @since EclairJS 0.1 Spark 1.4.0
* @param {Map} map
* @returns {module:eclairjs/sql.DataFrameReader}
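* @example
* // Illustrative usage; equivalent to calling option() once per key.
* var df = sqlContext.read().format("csv")
*     .options({"header": "true", "inferSchema": "true"})
*     .load("/path/to/people.csv");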
*/
DataFrameReader.prototype.options = function (map) {
var jmap = Utils.createJavaHashMap(map);
var javaObject = this.getJavaObject().options(jmap);
return new DataFrameReader(javaObject);
};
/**
* Loads input in as a {@link Dataset}.
*
* @since EclairJS 0.1 Spark 1.4.0
* @param {string} [path] Path to the data source. If specified, loads data sources that require a path
* (e.g. data backed by a local or distributed file system). If not specified, loads data sources that
* don't require a path (e.g. external key-value stores).
* @returns {module:eclairjs/sql.Dataset}
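* @example
* // Illustrative usage; assumes a SQLContext named `sqlContext` and an illustrative path.
* var df = sqlContext.read().format("parquet").load("/path/to/users.parquet");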
*/
DataFrameReader.prototype.load = function (path) {
var javaObject;
if (path) {
javaObject = this.getJavaObject().load(path);
} else {
javaObject = this.getJavaObject().load();
}
return Utils.javaToJs(javaObject);
};
/**
* Construct a {@link Dataset} representing the database table accessible via JDBC URL
* @example
* // JDBC URL, table name, and connection properties.
* var url="jdbc:mysql://localhost:3306/eclairjstesting";
* var table = "people";
* var connectionProperties = {"user" : "root", "password": "mypassword"};
* var predicates = ["age > 20"];
*
* // Load the whole table using the URL, table name, and connection properties.
* var peopleDF = sqlContext.read().jdbc(url, table, connectionProperties);
*
* // or
* // Partitions of the table will be retrieved in parallel based on the parameters
* // passed to this function.
* // Don't create too many partitions in parallel on a large cluster; otherwise Spark might crash
* //your external database systems.
* var peopleDF = sqlContext.read().jdbc(url,table,columnName,lowerBound,upperBound,numPartitions,connectionProperties);
*
* // or
* // URL, table name, predicates, and connection properties. The `predicates` parameter gives a list
* // of expressions suitable for inclusion in WHERE clauses; each one defines one partition of the {@link Dataset}.
* // Don't create too many partitions in parallel on a large cluster; otherwise Spark might crash
* // your external database systems.
* var peopleDF = sqlContext.read().jdbc(url,table,predicates,connectionProperties);
*
* @since EclairJS 0.1 Spark 1.4.0
* @param {string} url
* @param {string} table
* @param {object | string | string[]} connectionPropertiesMap|columnName|predicates
* If connectionPropertiesMap: JDBC database connection arguments, a map of arbitrary string tag/value pairs.
* Normally at least a "user" and "password" property should be included.
* If columnName: the name of a column of integral type that will be used for partitioning.
* If predicates: conditions in the WHERE clause for each partition.
* @param {number | object} lowerBound|connectionPropertiesMap
* If lowerBound: the minimum value of `columnName` used to decide partition stride.
* If connectionPropertiesMap: JDBC database connection arguments, a map of arbitrary string
* tag/value pairs. Normally at least a "user" and "password" property should be included.
* @param {number} upperBound the maximum value of `columnName` used to decide partition stride
* @param {number} numPartitions the number of partitions. The range `lowerBound`-`upperBound` will be split
* evenly into this many partitions
* @param {object} connectionProperties JDBC database connection arguments, a map of arbitrary string
* tag/value pairs. Normally at least a "user" and "password" property should be included.
* @returns {module:eclairjs/sql.Dataset}
*/
DataFrameReader.prototype.jdbc = function () {
var javaObject;
if (arguments.length == 3) {
// connectionPropertiesMap
var map = Utils.createJavaHashMap(arguments[2], new java.util.Properties());
javaObject = this.getJavaObject().jdbc(arguments[0], arguments[1], map);
} else if (arguments.length == 4) {
// predicates
var map = Utils.createJavaHashMap(arguments[3], new java.util.Properties());
javaObject = this.getJavaObject().jdbc(arguments[0], arguments[1], arguments[2], map);
} else if (arguments.length == 7) {
var map = Utils.createJavaHashMap(arguments[6], new java.util.Properties());
javaObject = this.getJavaObject().jdbc(arguments[0], arguments[1], arguments[2], arguments[3], arguments[4], arguments[5], map);
} else {
throw "DataFrameReader.jdbc() invalid number of arguments.";
}
return Utils.javaToJs(javaObject);
};
/**
* Loads a JSON file or an RDD[String] storing JSON objects (one object per line) and returns the result as a {@link Dataset}.
* If given a path, this function goes through the input once to determine the input schema. If you know the
* schema in advance, use the version that specifies the schema to avoid the extra scan.
* If given an RDD, unless the schema is specified using the {@link schema} function, this function goes through the
* input once to determine the input schema.
* @param {...string | module:eclairjs.RDD} path or RDD
* @returns {module:eclairjs/sql.Dataset}
* @since EclairJS 0.1 Spark 1.4.0
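* @example
* // Illustrative usage with a path (assumes a SQLContext named `sqlContext`):
* var df = sqlContext.read().json("/path/to/people.json");
* // or with an RDD of JSON strings (assumes a SparkContext named `sparkContext`):
* var rdd = sparkContext.parallelize(['{"name":"Andy","age":30}']);
* var df2 = sqlContext.read().json(rdd);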
*/
DataFrameReader.prototype.json = function () {
var RDD = require(EclairJS_Globals.NAMESPACE + '/RDD');
var arg = arguments[0];
if (arg instanceof RDD) {
return Utils.javaToJs(this.getJavaObject().json(Utils.unwrapObject(arg)));
} else {
return Utils.javaToJs(this.getJavaObject().json(Java.to(arguments, "java.lang.String[]")));
}
};
/**
* Loads a Parquet file, returning the result as a {@link Dataset}. This function returns an empty
* {@link Dataset} if no paths are passed in.
*
* @since EclairJS 0.1 Spark 1.4.0
* @param {...string} path
* @returns {module:eclairjs/sql.Dataset}
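* @example
* // Illustrative usage; assumes a SQLContext named `sqlContext` and an illustrative path.
* var df = sqlContext.read().parquet("/path/to/users.parquet");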
*/
DataFrameReader.prototype.parquet = function () {
var javaObject = this.getJavaObject().parquet(Java.to(arguments, "java.lang.String[]"));
return Utils.javaToJs(javaObject);
};
/**
* Loads an ORC file and returns the result as a {@link Dataset}.
*
* @param {...string} path input path
* @since EclairJS 0.1 Spark 1.5.0
* @note Currently, this method can only be used together with `HiveContext`.
* @returns {module:eclairjs/sql.Dataset}
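* @example
* // Illustrative usage; per the note above, assumes a HiveContext named `hiveContext`.
* var df = hiveContext.read().orc("/path/to/users.orc");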
*/
DataFrameReader.prototype.orc = function () {
var javaObject = this.getJavaObject().orc(Java.to(arguments, "java.lang.String[]"));
return Utils.javaToJs(javaObject);
};
/**
* Returns the specified table as a {@link Dataset}.
*
* @since EclairJS 0.1 Spark 1.4.0
* @param {string} tableName
* @returns {module:eclairjs/sql.Dataset}
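* @example
* // Illustrative usage; assumes a table named "people" has been registered in the SQLContext.
* var df = sqlContext.read().table("people");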
*/
DataFrameReader.prototype.table = function (tableName) {
var javaObject = this.getJavaObject().table(tableName);
return Utils.javaToJs(javaObject);
};
/**
* Loads a text file and returns a {@link Dataset} with a single string column named "value".
* Each line in the text file is a new row in the resulting Dataset. For example:
* @example
* sqlContext.read().text("/path/to/spark/README.md")
*
*
* @param {...string} paths input paths
* @since EclairJS 0.1 Spark 1.6.0
* @returns {module:eclairjs/sql.Dataset}
*/
DataFrameReader.prototype.text = function () {
var javaObject = this.getJavaObject().text(Java.to(arguments, "java.lang.String[]"));
return Utils.javaToJs(javaObject);
};
/**
* Loads a text file and returns a {@link Dataset} of String. Each line in the text file
* becomes one element of the resulting Dataset.
* @since EclairJS 0.5 Spark 2.0.0
* @param {string} path
* @returns {module:eclairjs/sql.Dataset}
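* @example
* // Illustrative usage; each line of the file becomes one String element.
* var ds = sqlContext.read().textFile("/path/to/spark/README.md");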
*/
DataFrameReader.prototype.textFile = function(path) {
var javaObject = this.getJavaObject().textFile(path);
return Utils.javaToJs(javaObject);
};
/**
* Loads a CSV file and returns the result as a {@link Dataset}.
* @param {...string} paths input paths
* @returns {module:eclairjs/sql.Dataset}
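* @example
* // Illustrative usage; assumes a SQLContext named `sqlContext` and an illustrative path.
* var df = sqlContext.read().csv("/path/to/people.csv");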
*/
DataFrameReader.prototype.csv = function () {
return Utils.javaToJs(this.getJavaObject().csv(Java.to(arguments, "java.lang.String[]")));
};
module.exports = DataFrameReader;
})();