How to use pyspark.sql.SparkSession.builder in pyspark

To help you get started, we’ve selected a few pyspark examples based on popular ways SparkSession.builder is used in public projects.

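For orientation, here is a minimal, self-contained sketch of the builder pattern the snippets below share. The app name, master URL, and config key are illustrative placeholders, not values taken from any of the projects:

from pyspark.sql import SparkSession

# Build (or reuse) a SparkSession. getOrCreate() returns the active session
# if one already exists in the process, otherwise it creates a new one.
spark = (
    SparkSession.builder
    .appName("example-app")                       # placeholder application name
    .master("local[2]")                           # run locally with two threads
    .config("spark.sql.shuffle.partitions", "4")  # optional tuning, shown as an example
    .getOrCreate()
)

df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "value"])
df.show()

spark.stop()

Every example that follows is a variation of this pattern: configure the builder, then call getOrCreate().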

github awslabs / aws-data-wrangler / testing / test_awswrangler / test_spark.py
# Test fixture: yields an awswrangler Session wrapping a SparkSession
# obtained from SparkSession.builder.
def session():
    yield Session(spark_session=SparkSession.builder.appName("AWS Wrangler Test").getOrCreate())

github opentargets / genetics-finemapping / tests / split_qtl / split_qtl.py
import os  # imports needed by this snippet
from glob import glob

import pyspark.sql


def main():

    # Args
    mol_pattern = '/home/emountjoy_statgen/data/sumstats/molecular_trait/*.parquet'
    out_dir = '/home/emountjoy_statgen/data/sumstats/molecular_trait_2/'
    # mol_pattern = '/Users/em21/Projects/genetics-finemapping/example_data/sumstats/molecular_trait/*.parquet'
    # out_dir = '/Users/em21/Projects/genetics-finemapping/example_data/sumstats/molecular_trait_2/'

    # Make spark session
    spark = (
        pyspark.sql.SparkSession.builder
        .config("parquet.enable.summary-metadata", "true")
        .getOrCreate()
    )
    print('Spark version: ', spark.version)

    # Process each
    for inf in glob(mol_pattern):

        # Load
        df = spark.read.parquet(inf)

        # Write
        outf = os.path.join(out_dir, os.path.basename(inf))
        (
            df
            .write
            .parquet(outf)  # assumed completion: the source listing truncates at ".write"
        )

github Thpffcj / BigData-Getting-Started / pyspark / project / spark.py
# -*- coding: UTF-8 -*-
# Created by thpffcj on 2019/10/19.
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import udf

if __name__ == '__main__':
    spark = SparkSession.builder.appName("project").getOrCreate()

    data2015 = spark.read.format("csv")\
        .option("header", "true")\
        .option("inferSchema", "true")\
        .load("file:///Users/thpffcj/Public/file/Beijing_2015_HourlyPM25_created20160201.csv")\
        .select("Year", "Month", "Day", "Hour", "Value", "QC Name")

    data2016 = spark.read.format("csv")\
        .option("header", "true")\
        .option("inferSchema", "true")\
        .load("file:///Users/thpffcj/Public/file/Beijing_2016_HourlyPM25_created20170201.csv")\
        .select("Year", "Month", "Day", "Hour", "Value", "QC Name")

    data2017 = spark.read.format("csv")\
        .option("header", "true")\
        .option("inferSchema", "true")\

github CharityNavigator / 990_long / python / parse_xml.py
import argparse
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from lxml import etree
import re
import time
import datetime
import sys
import signal

spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

### Get a log4j logger through the JVM gateway (see Stack Overflow 25407550)
log4jLogger = sc._jvm.org.apache.log4j
LOGGER = log4jLogger.LogManager.getLogger(__name__)

### Handle command line arguments

parser = argparse.ArgumentParser()
parser.add_argument("--input", action="append", help="Path to Parquet file containing raw XML. Can supply multiple.")
parser.add_argument("--output", action="store", help="Path in which to store result. Can be local or S3.", default="990_long/parsed")
parser.add_argument("--timestamp", action="store_true", help="If true, append the timestamp to the output path.")
parser.add_argument("--partitions", type=int, action="store", help="Number of partitions to use for XML parsing.", default=500)
parser.add_argument("--timeout", type=int, action="store", help="Number of seconds to spend parsing a single 990 eFile before it is skipped.", default=3)
parser.add_argument("--no-timeout", action="store_true", help="If true, don't time out, even if load is taking a long time.")
args = parser.parse_known_args()[0]

github uber / petastorm / examples / hello_world / external_dataset / generate_external_dataset.py
from pyspark.sql import SparkSession  # import needed by this snippet


def generate_external_dataset(output_url='file:///tmp/external_dataset'):
    """Creates an example dataset at output_url in Parquet format"""
    spark = SparkSession.builder \
        .master('local[2]') \
        .getOrCreate()
    sc = spark.sparkContext

    rows_count = 10
    # row_generator is defined elsewhere in the original example file
    rows_rdd = sc.parallelize(range(rows_count)) \
        .map(row_generator)

    spark.createDataFrame(rows_rdd). \
        write. \
        mode('overwrite'). \
        parquet(output_url)

github apache / airflow / airflow / contrib / hooks / spark_jdbc_script.py
    parser.add_argument('-jdbcTruncate', dest='truncate', action='store')
    parser.add_argument('-saveMode', dest='save_mode', action='store')
    parser.add_argument('-saveFormat', dest='save_format', action='store')
    parser.add_argument('-batchsize', dest='batch_size', action='store')
    parser.add_argument('-fetchsize', dest='fetch_size', action='store')
    parser.add_argument('-name', dest='name', action='store')
    parser.add_argument('-numPartitions', dest='num_partitions', action='store')
    parser.add_argument('-partitionColumn', dest='partition_column', action='store')
    parser.add_argument('-lowerBound', dest='lower_bound', action='store')
    parser.add_argument('-upperBound', dest='upper_bound', action='store')
    parser.add_argument('-createTableColumnTypes',
                        dest='create_table_column_types', action='store')
    arguments = parser.parse_args()

    # Disable dynamic allocation by default to allow num_executors to take effect.
    spark = SparkSession.builder \
        .appName(arguments.name) \
        .enableHiveSupport() \
        .getOrCreate()

    if arguments.cmd_type == "spark_to_jdbc":
        spark_write_to_jdbc(spark,
                            arguments.url,
                            arguments.user,
                            arguments.password,
                            arguments.metastore_table,
                            arguments.jdbc_table,
                            arguments.jdbc_driver,
                            arguments.truncate,
                            arguments.save_mode,
                            arguments.batch_size,
                            arguments.num_partitions,

github qubole / spark-on-lambda / examples / src / main / python / ml / estimator_transformer_param_example.py

"""
Estimator Transformer Param Example.
"""
from __future__ import print_function

# $example on$
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import LogisticRegression
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("EstimatorTransformerParamExample")\
        .getOrCreate()

    # $example on$
    # Prepare training data from a list of (label, features) tuples.
    training = spark.createDataFrame([
        (1.0, Vectors.dense([0.0, 1.1, 0.1])),
        (0.0, Vectors.dense([2.0, 1.0, -1.0])),
        (0.0, Vectors.dense([2.0, 1.3, 1.0])),
        (1.0, Vectors.dense([0.0, 1.2, -0.5]))], ["label", "features"])

    # Create a LogisticRegression instance. This instance is an Estimator.
    lr = LogisticRegression(maxIter=10, regParam=0.01)
    # Print out the parameters, documentation, and any default values.
    print("LogisticRegression parameters:\n" + lr.explainParams() + "\n")

github abilian / olapy / olapy / core / mdx / executor / spark / cube_loader.py
# -*- encoding: utf8 -*-
from __future__ import absolute_import, division, print_function, \
    unicode_literals

import os
from typing import Dict, Text

from pyspark.sql import DataFrame, SparkSession

from olapy.core.mdx.executor.cube_loader import CubeLoader

spark = SparkSession.builder.appName("olapy").getOrCreate()


class SparkCubeLoader(CubeLoader):
    def load_tables(self):
        # type: () -> Dict[Text, DataFrame]
        """Load tables from csv files.

        :return: tables dict with table name as key and dataframe as value
        """
        tables = {}
        for file in os.listdir(self.cube_path):
            # to remove file extension ".csv"
            table_name = os.path.splitext(file)[0]
            value = spark.read.csv(
                os.path.join(self.cube_path, file),
                header=True,
            )
            # Assumed completion: the listing truncates here. Per the docstring,
            # each loaded DataFrame is stored under its table name.
            tables[table_name] = value
        return tables

github zhexiao / sparksql-stats / spark_sql.py
    def init_spark(self):
        """
        Initialize the Spark session.
        :return:
        """
        self.spark = SparkSession.builder \
            .master(self.SPARK_MASTER) \
            .appName(self.APP_NAME) \
            .getOrCreate()

github sbl-sdsc / mmtf-pyspark / mmtfPyspark / ml / sequenceNgrammer.py
    data : dataset
       input dataset with column "sequence"
    n : int
       size of the n-gram
    shift : int
       start index for the n-gram
    outputCol : str
       name of the output column

    Returns
    -------
    dataset
        output dataset with appended ngram column
    '''

    session = SparkSession.builder.getOrCreate()

    #Encoder function to be passed as User Defined Function (UDF)
    def _ngrammer(s):
        ngram = []
        i,j = 0,0
        t = int(len(s)/n)

        if len(s) < shift:
            return []

        s = s[shift:]

        while j < t:
            ngram.append(s[i: i + n])
            j += 1
            i += n