How to use the pyspark.SparkConf class in pyspark

To help you get started, we’ve selected a few pyspark.SparkConf examples, based on popular ways it is used in public projects.

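All of the projects below follow the same basic pattern: build a SparkConf, set a few properties, and hand it to a SparkContext. As a quick orientation before the project examples, here is a minimal, self-contained sketch of that pattern; the app name, master URL, and memory setting are illustrative placeholders, not taken from any of the projects.

from pyspark import SparkConf, SparkContext

# Build a configuration object; the values here are placeholders.
conf = (SparkConf()
        .setAppName("sparkconf-demo")         # shows up in the Spark UI
        .setMaster("local[2]")                # run locally with 2 worker threads
        .set("spark.executor.memory", "1g"))  # arbitrary example property

# Hand the configuration to the context, use it, then shut it down.
sc = SparkContext(conf=conf)
print(sc.parallelize(range(10)).sum())
sc.stop()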

github snazrul1 / Shingho / test_files / unit_tests / single_thread_stats.py
try:
  #This should work if setup.py ran successfully
  import shingho
except ImportError:
  #If setup.py fails, manually add shingho to file path (may fail for certain dependencies)
  import sys
  sys.path.insert(0, '../../../shingho')
  import shingho
  
from shingho.single_thread_stats import rdd_stats, df_stats
from pyspark import SparkContext, SparkConf
import numpy as np
import pandas as pd
import unittest

#Setup Spark Context
conf = SparkConf().setAppName('single_thread_stats unit tests')
sc = SparkContext(conf=conf)

#Test data
np_data = np.array([1, 1, 1, 1, 1, 2, 2, 2, 3, 4, 4, 5, 5, 6, 7])
pd_data = pd.DataFrame({'key_col': np_data})
pd_data_grouped = pd_data.groupby('key_col')['key_col']
rdd_data = sc.parallelize(np_data)

#Shingho objects for rdd_stats and df_stats
rdd_stats_no_index = rdd_stats(rdd_data)
df_stats_no_index = df_stats(pd_data)
rdd_stats_index = rdd_stats(rdd_data, index_field=0)
df_stats_index = df_stats(pd_data, index_field=0)

#This unit test object will either give a 'pass' or 'fail' for each method 
class Single_Thread_Stats_Unit_Test(unittest.TestCase):
github xuezhizeng / CCA175-Exam-Preparation / 02-Transform Stage and Store / shared variables / accumulators.py
from pyspark import SparkConf, SparkContext

conf = SparkConf()
sc = SparkContext(conf=conf)

rdd = sc.textFile("/user/cloudera/spark/logs.txt"). \
  map(lambda x: x.split(" ")). \
  map(lambda x: x[1]). \
  filter(lambda x: x == "DEBUG")

print(rdd.collect())

accum = sc.accumulator(0)

rdd.foreach(lambda x: accum.add(1))

print(accum.value)
github Sotera / pst-extraction / spark / es_simple_ingest.py
description=desc,
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=desc)

    parser.add_argument("input_path", help="lines of json to ingest")
    parser.add_argument("es_resource", help="index and doc_type (my-index/doc)")
    parser.add_argument("--id_field", help="id field to map into es")
    parser.add_argument("--es_nodes", default="127.0.0.1:9200", help="es.nodes")

    args = parser.parse_args()

    nodes=[{"host":str(node), "port":9200} for node in args.es_nodes.split(',')]

    print "NODES:"+str(nodes)

    conf = SparkConf().setAppName("Elastic Ingest")
    sc = SparkContext(conf=conf)

    index, doc_type = args.es_resource.split("/")
    id_field = args.id_field

    rdd_emails = sc.textFile(args.input_path).coalesce(50).map(lambda x: json.loads(x))
    rdd_emails.foreachPartition(lambda docs: index_partition(docs, nodes, index, doc_type, id_field))
github intel / daal / samples / python / spark / sources / spark_CovarianceCSR.py
# Compute a sparse variance-covariance matrix on the master node
    covarianceMaster.compute()

    # Finalize computations and retrieve the results
    res = covarianceMaster.finalizeCompute()

    result = {}
    result['covariance'] = res.get(covariance.covariance)
    result['mean'] = res.get(covariance.mean)

    return result

if __name__ == "__main__":

    # Create a SparkContext that loads defaults from the system properties and the classpath, and set the application name
    sc = SparkContext(conf=SparkConf().setAppName("Spark covariance(CSR)").setMaster("local[4]"))

    # Read from the distributed HDFS data set at a specified path
    dd = DistributedHDFSDataSet("/Spark/CovarianceCSR/data/")
    dataRDD = dd.getCSRAsPairRDD(sc)

    # Compute a sparse variance-covariance matrix for dataRDD
    final_result = runCovariance(dataRDD)

    # Redirect stdout to a file for correctness verification
    stdout = sys.stdout
    sys.stdout = open('CovarianceCSR.out', 'w')

    # Print the results
    printNumericTable(final_result['covariance'], "Covariance matrix (upper left square 10*10) :", 10, 10, 9)
    printNumericTable(final_result['mean'], "Mean vector:", 1, 10, 9)
github Sonata-Princeton / SONATA-DEV / sonata / core / training / utils.py
def create_spark_context():
    from pyspark import SparkContext, SparkConf
    from sonata.system_config import TD_PATH, T
    conf = (SparkConf()
            .setMaster("local[*]")
            .setAppName("SONATA-Training")
            .set("spark.executor.memory", "6g")
            .set("spark.driver.memory", "20g")
            .set("spark.cores.max", "16"))

    sc = SparkContext(conf=conf)
    logger = sc._jvm.org.apache.log4j
    logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
    logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)

    return sc
github Valassis-Digital-Media / spylon / spylon / spark / launcher.py
Parameters
        ----------
        application_name : string

        Returns
        -------
        sc : SparkContext
        """

        # initialize the spark configuration
        self._init_spark()
        import pyspark
        import pyspark.sql

        # initialize conf
        spark_conf = pyspark.SparkConf()
        for k, v in self._spark_conf_helper._conf_dict.items():
            spark_conf.set(k, v)

        log.info("Starting SparkContext")
        return pyspark.SparkContext(appName=application_name, conf=spark_conf)
github titipata / pubmed_parser / scripts / pubmed_oa_spark.py
filter(lambda x: x is not None).\
        flatMap(lambda xs: [x for x in xs])
    parse_name_df = parse_name_rdd.toDF()
    parse_name_df.write.parquet(os.path.join(save_dir, 'pubmed_oa_author_%s.parquet' % date_update_str),
                                mode='overwrite')

    parse_affil_rdd = parse_results_rdd.map(lambda x: parse_affiliation(x)).\
        filter(lambda x: x is not None).\
        flatMap(lambda xs: [x for x in xs])
    parse_affil_df = parse_affil_rdd.toDF()
    # change to parse_affil_df
    parse_affil_df.write.parquet(os.path.join(save_dir, 'pubmed_oa_affiliation_%s.parquet' % date_update_str),
                                mode='overwrite')
    print('Finished parsing Pubmed Open-Access subset')

conf = SparkConf().setAppName('pubmed_oa_spark')\
    .setMaster('local[8]')\
    .set('spark.executor.memory', '8g')\
    .set('spark.driver.memory', '8g')\
    .set('spark.driver.maxResultSize', '0')

if __name__ == '__main__':
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    is_update, date_update = update()
    if is_update:
        process_file(date_update)
    sc.stop()
github metaspace2020 / metaspace / metaspace / engine / sm / engine / annotation_job.py
def _configure_spark(self):
        logger.info('Configuring Spark')
        sconf = SparkConf()
        for prop, value in self._sm_config['spark'].items():
            if prop.startswith('spark.'):
                sconf.set(prop, value)

        if 'aws' in self._sm_config:
            sconf.set("spark.hadoop.fs.s3a.access.key", self._sm_config['aws']['aws_access_key_id'])
            sconf.set(
                "spark.hadoop.fs.s3a.secret.key", self._sm_config['aws']['aws_secret_access_key']
            )
            sconf.set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
            sconf.set(
                "spark.hadoop.fs.s3a.endpoint",
                "s3.{}.amazonaws.com".format(self._sm_config['aws']['aws_default_region']),
            )

        self._sc = SparkContext(
github PeachstoneIO / peachbox / peachbox / spark.py
def get_spark_conf(self):
        conf       = dict(Spark._default_spark_conf, **self.spark_conf)
        spark_conf = pyspark.SparkConf()

        if conf.get('spark.app.name'): spark_conf.setAppName(conf.get('spark.app.name'))
        if conf.get('spark.master'):   spark_conf.setMaster(conf.get('spark.master'))
        if conf.get('spark.executor.memory'): 
            spark_conf.set('spark.executor.memory', conf.get('spark.executor.memory'))
        if conf.get('spark.cassandra.connection.host'): 
            spark_conf.set('spark.cassandra.connection.host', conf.get('spark.cassandra.connection.host'))
        if conf.get('spark.driver.memory'): 
            spark_conf.set('spark.driver.memory', conf.get('spark.driver.memory'))
        return spark_conf
github aravindr18 / RedditR--Insight-Data-Engineering-Project / SimpleGraph / degree_compute.py View on Github external
def main(argv):
    Conf = (SparkConf().setAppName("SimpleGraph"))
    sc = SparkContext(conf=Conf)
    sqlContext = SQLContext(sc)
    
    # the pre-processed parquet table
    dirPath = "hdfs://ec2-52-71-113-80.compute-1.amazonaws.com:9000/reddit/data/"+argv[1]+"-selfjoin.parquet"
    # the raw Data frame 
    rawDF = sqlContext.read.parquet(dirPath).persist(StorageLevel.MEMORY_AND_DISK_SER)
    rawDF.registerTempTable("self_join")
    
    # compute indegree
    indegree = sqlContext.sql("Select Subreddit as subreddit, OrigAuth as author, count(*) as rank from self_join group by Subreddit,OrigAuth ")
    # save it into cassandra
    indegree.write.format("org.apache.spark.sql.cassandra").options(table ="indegree", keyspace =keyspace).save(mode="append")
    # outdegree
    outdegree = sqlContext.sql("Select Subreddit as subreddit, RespAuth as author, count(*) as rank from self_join group by Subreddit,RespAuth")
    # save it into cassandra
    outdegree.write.format("org.apache.spark.sql.cassandra").options(table ="outdegree",keyspace =keyspace).save(mode="append")