/*
* Copyright 2015 IBM Corp.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
var Utils = require('../utils.js');
var RDD = require('../rdd/RDD.js');
var DataFrame = require('./DataFrame.js');
/**
 * @constructor
 * @memberof module:eclairjs/sql
 * @classdesc Interface used to load a DataFrame from external storage systems
 * (e.g. file systems, key-value stores, etc).
 * Use SQLContext.read to access this.
 * @param {*} kernelP stored as-is on the instance as `kernelP`
 *     (presumably a promise for the kernel connection - TODO confirm against Utils.generate)
 * @param {*} refIdP stored as-is on the instance as `refIdP`
 *     (presumably a promise for the remote reference id - TODO confirm against Utils.generate)
 */
function DataFrameReader(kernelP, refIdP) {
    // Both handles are kept on the instance; the prototype methods hand `this`
    // to Utils.generate as the call target.
    this.kernelP = kernelP;
    this.refIdP = refIdP;
}
/**
 * Specifies the input data source format.
 *
 * @since EclairJS 0.1 Spark 1.4.0
 * @param {string} source the input data source format
 * @returns {module:eclairjs/sql.DataFrameReader}
 */
DataFrameReader.prototype.format = function(source) {
    // Forward exactly what the caller supplied by wrapping the raw `arguments` object.
    var callArgs = Utils.wrapArguments(arguments);

    return Utils.generate({
        target: this,
        method: 'format',
        args: callArgs,
        returnType: DataFrameReader
    });
};
/**
 * Specifies the input schema. Some data sources (e.g. JSON) can infer the input schema
 * automatically from data. By specifying the schema here, the underlying data source can
 * skip the schema inference step, and thus speed up data loading.
 *
 * @since EclairJS 0.1 Spark 1.4.0
 * @param {module:eclairjs/sql/types.StructType} schema the input schema
 * @returns {module:eclairjs/sql.DataFrameReader}
 */
DataFrameReader.prototype.schema = function(schema) {
    // Forward exactly what the caller supplied by wrapping the raw `arguments` object.
    var callArgs = Utils.wrapArguments(arguments);

    return Utils.generate({
        target: this,
        method: 'schema',
        args: callArgs,
        returnType: DataFrameReader
    });
};
/**
 * Adds an input option for the underlying data source.
 *
 * @since EclairJS 0.1 Spark 1.4.0
 * @param {string} key name of the option
 * @param {string} value value of the option
 * @returns {module:eclairjs/sql.DataFrameReader}
 */
DataFrameReader.prototype.option = function(key, value) {
    // Forward exactly what the caller supplied by wrapping the raw `arguments` object.
    var callArgs = Utils.wrapArguments(arguments);

    return Utils.generate({
        target: this,
        method: 'option',
        args: callArgs,
        returnType: DataFrameReader
    });
};
/**
 * Adds input options for the underlying data source.
 *
 * @since EclairJS 0.1 Spark 1.4.0
 * @param {Map} options the input options to add
 * @returns {module:eclairjs/sql.DataFrameReader}
 */
DataFrameReader.prototype.options = function(options) {
    // Forward exactly what the caller supplied by wrapping the raw `arguments` object.
    var callArgs = Utils.wrapArguments(arguments);

    return Utils.generate({
        target: this,
        method: 'options',
        args: callArgs,
        returnType: DataFrameReader
    });
};
/**
 * Loads input in as a {@link DataFrame}
 *
 * @since EclairJS 0.1 Spark 1.4.0
 * @param {string} [path] Loads data sources that require a path (e.g. data backed by
 * a local or distributed file system). If not specified loads data sources that don't
 * require a path (e.g. external key-value stores).
 * @returns {module:eclairjs/sql.DataFrame}
 */
DataFrameReader.prototype.load = function(path) {
    // `arguments` (not `path`) is wrapped, so a call without a path forwards no arguments at all.
    var callArgs = Utils.wrapArguments(arguments);

    return Utils.generate({
        target: this,
        method: 'load',
        args: callArgs,
        returnType: DataFrame
    });
};
/**
 * Construct a {@link DataFrame} representing the database table accessible via JDBC URL
 * @example
 * var url = "jdbc:mysql://localhost:3306/eclairjstesting";
 * var table = "people";
 * var connectionProperties = {"user" : "root", "password": "mypassword"};
 * var predicates = ["age > 20"];
 *
 * // url named table and connection properties.
 * var peopleDF = sqlContext.read().jdbc(url, table, connectionProperties);
 *
 * // or
 * // Partitions of the table will be retrieved in parallel based on the parameters
 * // passed to this function.
 * // Don't create too many partitions in parallel on a large cluster; otherwise Spark might crash
 * // your external database systems.
 * var peopleDF = sqlContext.read().jdbc(url, table, columnName, lowerBound, upperBound, numPartitions, connectionProperties);
 *
 * // or
 * // url named table using connection properties. The `predicates` parameter gives a list of
 * // expressions suitable for inclusion in WHERE clauses; each one defines one partition of the {@link DataFrame}.
 * // Don't create too many partitions in parallel on a large cluster; otherwise Spark might crash
 * // your external database systems.
 * var peopleDF = sqlContext.read().jdbc(url, table, predicates, connectionProperties);
 *
 * @since EclairJS 0.1 Spark 1.4.0
 * @param {string} url
 * @param {string} table
 * @param {object | string | string[]} connectionPropertiesMap|columnName|predicates
 * If connectionPropertiesMap: JDBC database connection arguments, a map of arbitrary string tag/value.
 * Normally at least a "user" and "password" property should be included.
 * If columnName: the name of a column of integral type that will be used for partitioning.
 * If predicates: condition in the where clause for each partition.
 * @param {number | object} lowerBound|connectionPropertiesMap
 * If lowerBound: the minimum value of `columnName` used to decide partition stride.
 * If connectionPropertiesMap: JDBC database connection arguments, a list of arbitrary string
 * tag/value. Normally at least a "user" and "password" property should be included.
 * @param {number} upperBound the maximum value of `columnName` used to decide partition stride
 * @param {number} numPartitions the number of partitions. The range `minValue`-`maxValue` will be split
 * evenly into this many partitions
 * @param {object} connectionProperties JDBC database connection arguments, a list of arbitrary string
 * tag/value. Normally at least a "user" and "password" property
 * should be included.
 * @returns {module:eclairjs/sql.DataFrame}
 */
DataFrameReader.prototype.jdbc = function() {
    // No named parameters: every overload is forwarded as-is, in call order, via `arguments`.
    var callArgs = Utils.wrapArguments(arguments);

    return Utils.generate({
        target: this,
        method: 'jdbc',
        args: callArgs,
        returnType: DataFrame
    });
};
/**
 * Loads a JSON file, or RDD[String] storing JSON objects (one object per line) and returns
 * the result as a DataFrame.
 *
 * @param {string | RDD} input path of the JSON file, or an RDD of JSON strings
 * @returns {module:eclairjs/sql.DataFrame}
 */
DataFrameReader.prototype.json = function(input) {
    // Forward exactly what the caller supplied by wrapping the raw `arguments` object.
    var callArgs = Utils.wrapArguments(arguments);

    return Utils.generate({
        target: this,
        method: 'json',
        args: callArgs,
        returnType: DataFrame
    });
};
/**
 * Loads a Parquet file, returning the result as a {@link DataFrame}. This function returns an empty
 * {@link DataFrame} if no paths are passed in.
 *
 * @since EclairJS 0.1 Spark 1.4.0
 * @param {...string} paths one or more input paths
 * @returns {module:eclairjs/sql.DataFrame}
 */
DataFrameReader.prototype.parquet = function() {
    // Variadic: every path the caller passed is forwarded, in order, via `arguments`.
    var callArgs = Utils.wrapArguments(arguments);

    return Utils.generate({
        target: this,
        method: 'parquet',
        args: callArgs,
        returnType: DataFrame
    });
};
/**
 * Loads an ORC file and returns the result as a {@link DataFrame}.
 *
 * Note: currently, this method can only be used together with `HiveContext`.
 *
 * @since EclairJS 0.1 Spark 1.5.0
 * @param {string} path input path
 * @returns {module:eclairjs/sql.DataFrame}
 */
DataFrameReader.prototype.orc = function(path) {
    // Forward exactly what the caller supplied by wrapping the raw `arguments` object.
    var callArgs = Utils.wrapArguments(arguments);

    return Utils.generate({
        target: this,
        method: 'orc',
        args: callArgs,
        returnType: DataFrame
    });
};
/**
 * Returns the specified table as a {@link DataFrame}.
 *
 * @since EclairJS 0.1 Spark 1.4.0
 * @param {string} tableName name of the table to return
 * @returns {module:eclairjs/sql.DataFrame}
 */
DataFrameReader.prototype.table = function(tableName) {
    // Forward exactly what the caller supplied by wrapping the raw `arguments` object.
    var callArgs = Utils.wrapArguments(arguments);

    return Utils.generate({
        target: this,
        method: 'table',
        args: callArgs,
        returnType: DataFrame
    });
};
// Expose the DataFrameReader constructor as this module's export.
module.exports = DataFrameReader;