How to use the pyspark.sql.SparkSession class in pyspark

To help you get started, we’ve selected a few pyspark.sql.SparkSession examples, based on popular ways it is used in public projects.

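Before diving into the project snippets below, here is a minimal, self-contained sketch of the most common pattern: building (or reusing) a session with SparkSession.builder and creating a small DataFrame. The app name and sample data are illustrative.

from pyspark.sql import SparkSession

# Build a new session, or reuse the active one if it already exists.
spark = (
    SparkSession.builder
    .appName("example-app")   # illustrative name
    .master("local[*]")       # run locally; omit when submitting to a cluster
    .getOrCreate()
)

df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "label"])
df.show()

spark.stop()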

github IamMayankThakur / test-bigdata / adminmgr / media / code / A2 / python / task / BD_188_1000_1767.py View on Github external
return key,1

def parseNeighbors(urls):
    parts = re.split(r',', urls)
    return parts[0],int(parts[2])/int(parts[3])

def parseNeigbors1(urls):
    parts = re.split(r',',urls)
    return parts[0],parts[1]

if __name__ == "__main__":
    if len(sys.argv) != 4:
        print("Usage: pagerank  ", file=sys.stderr)
        sys.exit(-1)
    # Initialize the spark context.
    spark = SparkSession\
        .builder\
        .appName("PythonPageRank")\
        .getOrCreate()
    some_value = float((float(sys.argv[3]))/100)
    if some_value == 0:
        some_value = 0.8
    lines = spark.read.text(sys.argv[1]).rdd.map(lambda r: r[0])
    links2 = lines.map(lambda urls: parseNeighbors(urls)).groupByKey().mapValues(sum).cache()
    ranks = links2.map(lambda x: compute(x[0], x[1]))
    prevranks = links2.map(lambda x: compute(x[0], x[1]))
    links1 = lines.map(lambda urls: parseNeigbors1(urls)).groupByKey().cache()
    
    count_value = 0
    count = 0
    t = True
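The snippet above builds its RDD with spark.read.text(...).rdd; a minimal, self-contained sketch of that pattern follows (the input path is illustrative).

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("read-text-example").getOrCreate()

# spark.read.text returns a DataFrame with a single 'value' column;
# mapping r[0] over its underlying RDD yields plain strings.
lines = spark.read.text("data/links.txt").rdd.map(lambda r: r[0])  # illustrative path
pairs = lines.map(lambda line: tuple(line.split(",")[:2]))
print(pairs.take(5))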
github IamMayankThakur / test-bigdata / adminmgr / media / code / A2 / python / task / BD_94_155_1509.py View on Github external
parts = re.split(r',+', urls)
    return parts[0],parts[1]
    

if __name__ == "__main__":
    if len(sys.argv) != 4:
        print("Usage: pagerank   ", file=sys.stderr)
        sys.exit(-1)
    if int(sys.argv[3]) == 0:
        first = 0.80
        second = 0.20
    if int(sys.argv[3]) > 0:
        first = int(sys.argv[3]) * 0.01
        second = 1 - first
    # Initialize the spark context.
    spark = SparkSession\
        .builder\
        .appName("PythonPageRank")\
        .getOrCreate()
    
    lines = spark.read.text(sys.argv[1]).rdd.map(lambda r: r[0])
    #print(lines.collect())
    
    links = lines.map(lambda urls: parseNeighbors(urls)).distinct().groupByKey().cache()
    #print(links.collect())
    inter_ranks = lines.map(lambda urls: avg(urls)).distinct().reduceByKey(add)
    ranks = inter_ranks.map(lambda x: (x[0], max(x[1], 1))).sortBy(lambda x: (x[1], x[0]), False)
    #t = ranks.collect()
    #print(ranks)
    

    #print("RANKS:",ranks.collect())
github htorrence / pytest_examples / tests / fixtures.py View on Github external
def spark(request):
    """
    Creates a SparkSession for local testing

    Parameters
    ----------
    request: pytest.FixtureRequest object
        provides access to testing context
    """

    spark = (
        SparkSession
        .builder
        .appName('pytest-pyspark-local-testing')
        .master('local[2]')
        .getOrCreate()
    )

    request.addfinalizer(lambda: spark.stop())

    return spark
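A minimal sketch of a test that consumes the fixture above, assuming it is registered with @pytest.fixture (for example in conftest.py):

def test_create_dataframe(spark):
    df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "label"])
    assert df.count() == 2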
github TresAmigosSD / SMV / src / main / python / test_support / testconfig.py View on Github external
#   * Create python SparkContext using the SparkConf (so we can specify the warehouse.dir)
            #   * Create Scala side HiveTestContext SparkSession
            #   * Create python SparkSession
            jgw = launch_gateway(None)
            jvm = jgw.jvm
            import tempfile
            import getpass
            hivedir = "file://{0}/{1}/smv_hive_test".format(tempfile.gettempdir(), getpass.getuser())
            sConf = SparkConf(False, _jvm=jvm).set("spark.sql.test", "")\
                                              .set("spark.sql.hive.metastore.barrierPrefixes",
                                                   "org.apache.spark.sql.hive.execution.PairSerDe")\
                                              .set("spark.sql.warehouse.dir", hivedir)\
                                              .set("spark.ui.enabled", "false")
            sc = SparkContext(master="local[1]", appName="SMV Python Test", conf=sConf, gateway=jgw).getOrCreate()
            jss = sc._jvm.org.apache.spark.sql.hive.test.SmvTestHive.createContext(sc._jsc.sc())
            cls.spark = SparkSession(sc, jss.sparkSession())
        return cls.spark
github HoloClean / HoloClean-Legacy-deprecated / holoclean / OLD / app_components-OLD.py View on Github external
def start(self):
        
        spark = SparkSession.builder.getOrCreate()
        
        return spark
github apache / systemml / src / main / python / systemml / defmatrix.py View on Github external
def setSparkContext(sc):
    """
    Before using the matrix, the user needs to invoke this function if SparkContext is not previously created in the session.

    Parameters
    ----------
    sc: SparkContext
        SparkContext
    """
    matrix.sc = sc
    matrix.sparkSession = SparkSession.builder.getOrCreate()
    matrix.ml = MLContext(matrix.sc)
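A minimal usage sketch for the helper above, assuming the systemml Python package is installed; the import path follows the module shown above (systemml.defmatrix), and the app name is illustrative.

from pyspark.sql import SparkSession
from systemml.defmatrix import setSparkContext  # assumed import path (the module shown above)

spark = SparkSession.builder.appName("systemml-example").getOrCreate()
setSparkContext(spark.sparkContext)  # register the context before creating matrix objects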
github sbl-sdsc / mmtf-pyspark / mmtfPyspark / io / mmtfReader.py View on Github external
def download_reduced_mmtf_files(pdbIds):
    '''Downloads and reads the specified PDB entries in reduced MMTF format
    using MMTF web services.

    Parameters
    ----------
    pdbIds : list
       List of PDB IDs to download

    Returns
    -------
    data
       structure data as key/value pairs
    '''
    spark = SparkSession.builder.getOrCreate()
    sc = spark.sparkContext
    
    return sc.parallelize(set(pdbIds)) \
             .map(lambda t: _get_structure(t, True)) \
             .filter(lambda t: t is not None)
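A minimal usage sketch for the reader above, assuming mmtf-pyspark is installed; the PDB IDs are illustrative.

from pyspark.sql import SparkSession
from mmtfPyspark.io import mmtfReader

spark = SparkSession.builder.appName("mmtf-example").getOrCreate()
structures = mmtfReader.download_reduced_mmtf_files(["4HHB", "1AQ1"])  # illustrative PDB IDs
print(structures.count())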
github Ibotta / sk-dist / examples / eliminate / covtype.py View on Github external
Feature Elimination Lift: 0.015065852924005307
"""
print(__doc__)

import time
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from skdist.distribute.eliminate import DistFeatureEliminator
from sklearn.datasets import fetch_covtype
from pyspark.sql import SparkSession

# instantiate spark session
spark = (
    SparkSession
    .builder
    .getOrCreate()
    )
sc = spark.sparkContext

# params
scoring = "f1_weighted"
cv = 5
min_features_to_select = 10

# load data and define base classifier
X,y = fetch_covtype(return_X_y=True)
clf = RandomForestClassifier(n_estimators=100, max_depth=10)

# eliminate features, keeping at least 10
start = time.time()
github Phynixknight / WideDeepSequenceAttentionFrame / Hive_Interaction.py View on Github external
def __init__(self,app_name):
        self.spark = SparkSession \
            .builder \
            .appName(app_name) \
            .enableHiveSupport() \
            .config("spark.rpc.message.maxSize", "60") \
            .config("spark.files.overwrite", "true") \
            .config("spark.hadoop.validateOutputSpecs", "false") \
            .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
            .config("spark.kryoserializer.buffer.max", "2000m") \
            .config("fs.defaultFS", "hdfs://mgjcluster") \
            .getOrCreate()
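A shorter sketch of the same idea: a Hive-enabled session can run HiveQL directly through spark.sql. The app name and query are illustrative, and a reachable Hive metastore is assumed.

from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("hive-example")
    .enableHiveSupport()
    .getOrCreate()
)
spark.sql("SHOW DATABASES").show()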
github Yelp / mrjob / mrjob / spark / harness.py View on Github external
def _text_file_with_path(sc, path):
    """Return an RDD that yields (path, line) for each line in the file.

    *path* must be a single path, not a comma-separated list of paths
    """
    from pyspark.sql import SparkSession
    from pyspark.sql import functions as F

    spark = SparkSession(sc)

    df = spark.read.text(path).select([
        F.input_file_name().alias('input_file_name'),
        F.col('value')
    ])

    return df.rdd.map(
        lambda row: (row.input_file_name,
                     (row.value if isinstance(row.value, bytes)
                      else row.value.encode('utf_8')))
    )
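A minimal usage sketch for the helper above, assuming an active SparkContext and a readable text file at the (illustrative) path below.

from pyspark import SparkContext

sc = SparkContext.getOrCreate()
rdd = _text_file_with_path(sc, "data/sample.txt")  # illustrative path
for path, line in rdd.take(3):
    print(path, line)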