How to use the pyspark.sql.SparkSession.builder.appName function in pyspark

To help you get started, we’ve selected a few PySpark examples based on popular ways this function is used in public projects.

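Every example below follows the same pattern: build a SparkSession, give it a human-readable application name with appName, and call getOrCreate(). As a quick, minimal sketch (the app name "MyExampleApp" is just a placeholder):

from pyspark.sql import SparkSession

# appName sets the name shown in the Spark UI and in logs;
# getOrCreate() returns the existing session if one is already running.
spark = SparkSession.builder \
    .appName("MyExampleApp") \
    .getOrCreate()

df = spark.range(5)   # simple sanity check that the session works
df.show()

spark.stop()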

github IamMayankThakur / test-bigdata / adminmgr / media / code / A2 / python / task / BD_1621_1634_1906_U2kyAzB.py View on Github external
def sum1(z):
    # Sum the values in z as floats; the result is floored at 1.0
    u = 0.0
    for i in range(len(z)):
        u = u + float(z[i])
    if u > 1:
        return float(u)
    else:
        return 1.0


if __name__ == "__main__":
    if len(sys.argv) != 4:
        print("Usage: pagerank  ", file=sys.stderr)
        sys.exit(-1)

    # Initialize the Spark session.
    spark = SparkSession\
        .builder\
        .appName("PythonPageRank")\
        .getOrCreate()

    lines = spark.read.text(sys.argv[1]).rdd.map(lambda r: r[0])

    links = lines.map(lambda batsman: parseNeighbors(batsman)).distinct().groupByKey().cache()

    links1 = lines.map(lambda batsman: parseNeighbors1(batsman)).distinct().groupByKey().cache()

    link = links.map(lambda x: (x[0], float(sum1(list(x[1])))))
    # print(links.collect())
    ranks = link.map(lambda url_neighbors: (url_neighbors[0], url_neighbors[1]))
    # print(ranks.collect())
    iterations = 0
github mesosphere / spark-build / tests / jobs / python / pi_with_include.py View on Github external
import sys
from random import random
from operator import add

from pyspark.sql import SparkSession

import PySparkTestInclude

if __name__ == "__main__":
    """
        Usage: pi [partitions]
    """

    # Make sure we can include this user-provided module
    PySparkTestInclude.func()

    spark = SparkSession\
        .builder\
        .appName("PythonPi")\
        .getOrCreate()

    partitions = int(sys.argv[1]) if len(sys.argv) > 1 else 2
    n = 100000 * partitions

    def f(_):
        x = random() * 2 - 1
        y = random() * 2 - 1
        return 1 if x ** 2 + y ** 2 < 1 else 0

    count = spark.sparkContext.parallelize(range(1, n + 1), partitions).map(f).reduce(add)
    print("Pi is roughly %f" % (4.0 * count / n))

    spark.stop()
github GoogleCloudPlatform / data-science-on-gcp / 07_sparkml_and_bqml / experiment.py View on Github external
from __future__ import print_function
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.mllib.regression import LabeledPoint
from pyspark.sql import SparkSession
from pyspark import SparkContext
import numpy as np

sc = SparkContext('local', 'logistic')
spark = SparkSession \
    .builder \
    .appName("Logistic regression w/ Spark ML") \
    .getOrCreate()

BUCKET='BUCKET_NAME'

# read dataset
traindays = spark.read \
    .option("header", "true") \
    .csv('gs://{}/flights/trainday.csv'.format(BUCKET))
traindays.createOrReplaceTempView('traindays')

from pyspark.sql.types import StringType, FloatType, StructType, StructField

header = 'FL_DATE,UNIQUE_CARRIER,AIRLINE_ID,CARRIER,FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN_AIRPORT_SEQ_ID,ORIGIN_CITY_MARKET_ID,ORIGIN,DEST_AIRPORT_ID,DEST_AIRPORT_SEQ_ID,DEST_CITY_MARKET_ID,DEST,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,TAXI_OUT,WHEELS_OFF,WHEELS_ON,TAXI_IN,CRS_ARR_TIME,ARR_TIME,ARR_DELAY,CANCELLED,CANCELLATION_CODE,DIVERTED,DISTANCE,DEP_AIRPORT_LAT,DEP_AIRPORT_LON,DEP_AIRPORT_TZOFFSET,ARR_AIRPORT_LAT,ARR_AIRPORT_LON,ARR_AIRPORT_TZOFFSET,EVENT,NOTIFY_TIME'
github w34ma / spark-cnn / cnn / spark_cnn.py View on Github external
def __init__(self, I, B):
    CNN.__init__(self, I)
    self.B = B  # number of batches
    classifications = load_classifications()
    C = len(classifications)
    self.init_layers(C)
    self.C = C
    # create a Spark session and keep a handle to its SparkContext
    spark = SparkSession.builder.appName('cnn').getOrCreate()
    self.sc = spark.sparkContext
github uber / petastorm / examples / imagenet / generate_petastorm_imagenet.py View on Github external
    >>>    *.JPEG

    >>> nZZZZZZZZ/
    >>>    *.JPEG

    :param imagenet_path: a path to the directory containing ``n*/`` subdirectories. If you are running this script on
      a Spark cluster, you should have this file be mounted and accessible to executors.
    :param output_url: the location where your dataset will be written to. Should be a url: either
      ``file://...`` or ``hdfs://...``
    :param spark_master: A master parameter used by spark session builder. Use default value (``None``) to use system
      environment configured spark cluster. Use ``local[*]`` to run on a local box.
    :param noun_id_to_text: A dictionary: ``{noun_id : text}``. If ``None``, this function will download the dictionary
      from the Internet.
    :return: ``None``
    """
    session_builder = SparkSession \
        .builder \
        .appName('Imagenet Dataset Creation') \
        .config('spark.executor.memory', '10g') \
        .config('spark.driver.memory', '10g')  # Increase the memory if running locally with a high number of executors
    if spark_master:
        session_builder.master(spark_master)

    spark = session_builder.getOrCreate()
    sc = spark.sparkContext

    # Get a list of noun_ids
    noun_ids = os.listdir(imagenet_path)
    if not all(noun_id.startswith('n') for noun_id in noun_ids):
        raise RuntimeError('Directory {} expected to contain only subdirectories with name '
                           'starting with "n".'.format(imagenet_path))
github lifeomic / sparkflow / examples / simple_dnn.py View on Github external
import tensorflow as tf
from pyspark.sql import SparkSession
from pyspark.sql.functions import rand
from pyspark.ml.feature import VectorAssembler, OneHotEncoder
# build_graph and build_adam_config are sparkflow helpers (module path assumed to be sparkflow.graph_utils)
from sparkflow.graph_utils import build_graph, build_adam_config
from sparkflow.pipeline_util import PysparkPipelineWrapper


def small_model():
    x = tf.placeholder(tf.float32, shape=[None, 784], name='x')
    y = tf.placeholder(tf.float32, shape=[None, 10], name='y')
    layer1 = tf.layers.dense(x, 256, activation=tf.nn.relu, kernel_initializer=tf.glorot_uniform_initializer())
    layer2 = tf.layers.dense(layer1, 256, activation=tf.nn.relu, kernel_initializer=tf.glorot_uniform_initializer())
    out = tf.layers.dense(layer2, 10, kernel_initializer=tf.glorot_uniform_initializer())
    z = tf.argmax(out, 1, name='out')
    loss = tf.losses.softmax_cross_entropy(y, out)
    return loss


if __name__ == '__main__':
    spark = SparkSession.builder \
        .appName("examples") \
        .master('local[4]').config('spark.driver.memory', '2g') \
        .getOrCreate()

    # Read in mnist_train.csv dataset
    df = spark.read.option("inferSchema", "true").csv('examples/mnist_train.csv').orderBy(rand())

    # Build the tensorflow graph
    mg = build_graph(small_model)

    # Build the adam optimizer
    adam_config = build_adam_config(learning_rate=0.001, beta1=0.9, beta2=0.999)

    # Setup features
    vector_assembler = VectorAssembler(inputCols=df.columns[1:785], outputCol='features')
    encoder = OneHotEncoder(inputCol='_c0', outputCol='labels', dropLast=False)
github qubole / spark-on-lambda / examples / src / main / python / ml / vector_slicer_example.py View on Github external
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from __future__ import print_function

# $example on$
from pyspark.ml.feature import VectorSlicer
from pyspark.ml.linalg import Vectors
from pyspark.sql.types import Row
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("VectorSlicerExample")\
        .getOrCreate()

    # $example on$
    df = spark.createDataFrame([
        Row(userFeatures=Vectors.sparse(3, {0: -2.0, 1: 2.3})),
        Row(userFeatures=Vectors.dense([-2.0, 2.3, 0.0]))])

    slicer = VectorSlicer(inputCol="userFeatures", outputCol="features", indices=[1])

    output = slicer.transform(df)

    output.select("userFeatures", "features").show()
    # $example off$
github qubole / spark-on-lambda / examples / src / main / python / ml / simple_text_classification_pipeline.py View on Github external
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.sql import Row, SparkSession


"""
A simple text classification pipeline that recognizes "spark" from
input text. This is to show how to create and configure a Spark ML
pipeline in Python. Run with:

  bin/spark-submit examples/src/main/python/ml/simple_text_classification_pipeline.py
"""


if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("SimpleTextClassificationPipeline")\
        .getOrCreate()

    # Prepare training documents, which are labeled.
    training = spark.createDataFrame([
        (0, "a b c d e spark", 1.0),
        (1, "b d", 0.0),
        (2, "spark f g h", 1.0),
        (3, "hadoop mapreduce", 0.0)
    ], ["id", "text", "label"])

    # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(numFeatures=1000, inputCol=tokenizer.getOutputCol(), outputCol="features")
    lr = LogisticRegression(maxIter=10, regParam=0.001)
github ksbg / sparklanes / sparklanes / _framework / spark.py View on Github external
def __init_spark(cls):
    cls.spark = SparkSession.builder.appName(SPARK_APP_NAME).getOrCreate()
github Azure / mmlspark / tools / helm / zepplin / mmlsparkExamples / serving.py View on Github external
import mmlspark
from pyspark.sql.types import *
from pyspark.sql import SparkSession

from pyspark.sql.functions import length, col

spark = SparkSession.builder.appName("SimpleContServing").getOrCreate()
sc = spark.sparkContext
sc.setLogLevel("WARN")

print("creating df")
df = spark.readStream.continuousServer() \
    .address("0.0.0.0", 8888, "my_api") \
    .load() \
    .parseRequest(StructType().add("foo", StringType()).add("bar", IntegerType()))

replies = df.withColumn("fooLength", length(col("foo")))\
    .makeReply("fooLength")

print("creating server")
server = replies\
    .writeStream \
    .continuousServer() \