try:
    # This should work if setup.py ran successfully
    import shingho
except ImportError:
    # If setup.py fails, manually add shingho to the path (may fail for certain dependencies)
    import sys
    sys.path.insert(0, '../../../shingho')
    import shingho
from shingho.single_thread_stats import rdd_stats, df_stats
from pyspark import SparkContext, SparkConf
import numpy as np
import pandas as pd
import unittest
#Setup Spark Context
conf = SparkConf().setAppName('single_thread_stats unit tests')
sc = SparkContext(conf=conf)
# Test data
np_data = np.array([1, 1, 1, 1, 1, 2, 2, 2, 3, 4, 4, 5, 5, 6, 7])
pd_data = pd.DataFrame({'key_col': np_data})
pd_data_grouped = pd_data.groupby('key_col')['key_col']
rdd_data = sc.parallelize(np_data)
# Shingho objects for rdd_stats and df_stats
# (constructor arguments here are assumed; shingho's exact rdd_stats/df_stats signatures may differ)
rdd_stats_no_index = rdd_stats(rdd_data)
rdd_stats_index = rdd_stats(rdd_data, index_field=0)
df_stats_no_index = df_stats(pd_data)
df_stats_index = df_stats(pd_data_grouped)

# This unit test object will either give a 'pass' or 'fail' for each method
class Single_Thread_Stats_Unit_Test(unittest.TestCase):
    pass  # test methods not shown in this excerpt
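A sketch of what one of those test methods might look like, comparing a shingho result against a numpy reference; the mean() accessor on rdd_stats is an assumption and not taken from the snippet above.

class Example_RDD_Stats_Test(unittest.TestCase):
    def test_mean(self):
        expected = float(np.mean(np_data))       # reference value computed with numpy
        actual = rdd_stats_index.mean()          # assumed shingho accessor
        self.assertAlmostEqual(actual, expected)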
from pyspark import SparkConf, SparkContext

conf = SparkConf()
sc = SparkContext(conf=conf)

# Keep only the log-level field of each line and retain the DEBUG entries
rdd = sc.textFile("/user/cloudera/spark/logs.txt") \
    .map(lambda x: x.split(" ")) \
    .map(lambda x: x[1]) \
    .filter(lambda x: x == "DEBUG")
print(rdd.collect())

# Count the DEBUG lines with an accumulator; it is updated when the foreach action runs
accum = sc.accumulator(0)
rdd.foreach(lambda x: accum.add(1))
print(accum.value)
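Because the accumulator is only updated when the foreach action runs, the same total can be cross-checked with a direct action on the filtered RDD:

# Cross-check: count() triggers its own action and returns the number of DEBUG lines
print(rdd.count())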
import argparse
import json

from pyspark import SparkConf, SparkContext

parser = argparse.ArgumentParser(
    description=desc,
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog=desc)
parser.add_argument("input_path", help="lines of json to ingest")
parser.add_argument("es_resource", help="index and doc_type (my-index/doc)")
parser.add_argument("--id_field", help="id field to map into es")
parser.add_argument("--es_nodes", default="127.0.0.1:9200", help="es.nodes")
args = parser.parse_args()

nodes = [{"host": str(node), "port": 9200} for node in args.es_nodes.split(',')]
print("NODES: " + str(nodes))

conf = SparkConf().setAppName("Elastic Ingest")
sc = SparkContext(conf=conf)

index = args.es_resource.split("/")[0]
doc_type = args.es_resource.split("/")[1]
id_field = args.id_field

# Load the JSON lines, repartition, and index each partition into Elasticsearch
rdd_emails = sc.textFile(args.input_path).coalesce(50).map(lambda x: json.loads(x))
rdd_emails.foreachPartition(lambda docs: index_partition(docs, nodes, index, doc_type, id_field))
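index_partition is referenced above but not defined in this excerpt. A minimal sketch of such a helper, assuming the elasticsearch Python client (pre-8.x) and its bulk helper; the function name and signature mirror the call above, but the body is an assumption:

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

def index_partition(docs, nodes, index, doc_type, id_field):
    # One client per partition; 'nodes' is the list of {"host": ..., "port": ...} dicts built above
    es = Elasticsearch(nodes)
    actions = []
    for doc in docs:
        # "_type" only applies to older Elasticsearch versions that still use mapping types
        action = {"_index": index, "_type": doc_type, "_source": doc}
        if id_field:
            action["_id"] = doc[id_field]
        actions.append(action)
    bulk(es, actions)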
import sys

from pyspark import SparkConf, SparkContext

def runCovariance(dataRDD):
    # (construction of covarianceMaster, the distributed covariance algorithm, is not shown in this excerpt)
    # Compute a sparse variance-covariance matrix on the master node
    covarianceMaster.compute()
    # Finalize computations and retrieve the results
    res = covarianceMaster.finalizeCompute()
    result = {}
    result['covariance'] = res.get(covariance.covariance)
    result['mean'] = res.get(covariance.mean)
    return result

if __name__ == "__main__":
    # Create a SparkContext that loads defaults from the system properties and the classpath and sets the name
    sc = SparkContext(conf=SparkConf().setAppName("Spark covariance(CSR)").setMaster("local[4]"))

    # Read from the distributed HDFS data set at a specified path
    dd = DistributedHDFSDataSet("/Spark/CovarianceCSR/data/")
    dataRDD = dd.getCSRAsPairRDD(sc)

    # Compute a sparse variance-covariance matrix for dataRDD
    final_result = runCovariance(dataRDD)

    # Redirect stdout to a file for correctness verification
    stdout = sys.stdout
    sys.stdout = open('CovarianceCSR.out', 'w')

    # Print the results
    printNumericTable(final_result['covariance'], "Covariance matrix (upper left square 10*10) :", 10, 10, 9)
    printNumericTable(final_result['mean'], "Mean vector:", 1, 10, 9)

    # Restore stdout and shut the context down
    sys.stdout = stdout
    sc.stop()
def create_spark_context():
    from pyspark import SparkContext, SparkConf
    from sonata.system_config import TD_PATH, T

    conf = (SparkConf()
            .setMaster("local[*]")
            .setAppName("SONATA-Training")
            .set("spark.executor.memory", "6g")
            .set("spark.driver.memory", "20g")
            .set("spark.cores.max", "16"))
    sc = SparkContext(conf=conf)

    # Silence verbose Spark and Akka logging
    logger = sc._jvm.org.apache.log4j
    logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
    logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)
    return sc
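A quick smoke test for the helper above; the job itself is illustrative and not part of the original:

sc = create_spark_context()
# Distribute a small range and count it to confirm the context works
print(sc.parallelize(range(100)).count())
sc.stop()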
Parameters
----------
application_name : string

Returns
-------
sc : SparkContext
"""
# initialize the spark configuration
self._init_spark()

import pyspark
import pyspark.sql

# initialize conf
spark_conf = pyspark.SparkConf()
for k, v in self._spark_conf_helper._conf_dict.items():
    spark_conf.set(k, v)

log.info("Starting SparkContext")
return pyspark.SparkContext(appName=application_name, conf=spark_conf)
parse_name_rdd = parse_results_rdd.map(lambda x: parse_name(x)).\
    filter(lambda x: x is not None).\
    flatMap(lambda xs: [x for x in xs])
parse_name_df = parse_name_rdd.toDF()
parse_name_df.write.parquet(os.path.join(save_dir, 'pubmed_oa_author_%s.parquet' % date_update_str),
                            mode='overwrite')

parse_affil_rdd = parse_results_rdd.map(lambda x: parse_affiliation(x)).\
    filter(lambda x: x is not None).\
    flatMap(lambda xs: [x for x in xs])
parse_affil_df = parse_affil_rdd.toDF()
parse_affil_df.write.parquet(os.path.join(save_dir, 'pubmed_oa_affiliation_%s.parquet' % date_update_str),
                             mode='overwrite')
print('Finished parsing Pubmed Open-Access subset')
conf = SparkConf().setAppName('pubmed_oa_spark')\
    .setMaster('local[8]')\
    .set('spark.executor.memory', '8g')\
    .set('spark.driver.memory', '8g')\
    .set('spark.driver.maxResultSize', '0')

if __name__ == '__main__':
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    is_update, date_update = update()
    if is_update:
        process_file(date_update)
    sc.stop()
def _configure_spark(self):
    logger.info('Configuring Spark')
    sconf = SparkConf()
    for prop, value in self._sm_config['spark'].items():
        if prop.startswith('spark.'):
            sconf.set(prop, value)

    if 'aws' in self._sm_config:
        sconf.set("spark.hadoop.fs.s3a.access.key", self._sm_config['aws']['aws_access_key_id'])
        sconf.set(
            "spark.hadoop.fs.s3a.secret.key", self._sm_config['aws']['aws_secret_access_key']
        )
        sconf.set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
        sconf.set(
            "spark.hadoop.fs.s3a.endpoint",
            "s3.{}.amazonaws.com".format(self._sm_config['aws']['aws_default_region']),
        )

    self._sc = SparkContext(conf=sconf)  # (further constructor arguments in the original are not shown in this excerpt)
def get_spark_conf(self):
    conf = dict(Spark._default_spark_conf, **self.spark_conf)
    spark_conf = pyspark.SparkConf()
    if conf.get('spark.app.name'):
        spark_conf.setAppName(conf.get('spark.app.name'))
    if conf.get('spark.master'):
        spark_conf.setMaster(conf.get('spark.master'))
    if conf.get('spark.executor.memory'):
        spark_conf.set('spark.executor.memory', conf.get('spark.executor.memory'))
    if conf.get('spark.cassandra.connection.host'):
        spark_conf.set('spark.cassandra.connection.host', conf.get('spark.cassandra.connection.host'))
    if conf.get('spark.driver.memory'):
        spark_conf.set('spark.driver.memory', conf.get('spark.driver.memory'))
    return spark_conf
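A minimal usage sketch for the method above, assuming an instance named spark of the class that owns get_spark_conf (the instance name is illustrative):

import pyspark

# Build a SparkConf from the merged defaults and start a context from it
sc = pyspark.SparkContext(conf=spark.get_spark_conf())
print(sc.parallelize([1, 2, 3]).sum())
sc.stop()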
def main(argv):
    conf = SparkConf().setAppName("SimpleGraph")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    # the pre-processed parquet table
    dirPath = "hdfs://ec2-52-71-113-80.compute-1.amazonaws.com:9000/reddit/data/" + argv[1] + "-selfjoin.parquet"

    # the raw data frame, cached and registered for SQL queries
    # (registerTempTable returns None, so register the table separately instead of chaining)
    rawDF = sqlContext.read.parquet(dirPath).persist(StorageLevel.MEMORY_AND_DISK_SER)
    rawDF.registerTempTable("self_join")

    # compute indegree
    indegree = sqlContext.sql("SELECT Subreddit AS subreddit, OrigAuth AS author, count(*) AS rank FROM self_join GROUP BY Subreddit, OrigAuth")
    # save it into Cassandra
    indegree.write.format("org.apache.spark.sql.cassandra").options(table="indegree", keyspace=keyspace).save(mode="append")

    # compute outdegree
    outdegree = sqlContext.sql("SELECT Subreddit AS subreddit, RespAuth AS author, count(*) AS rank FROM self_join GROUP BY Subreddit, RespAuth")
    # save it into Cassandra
    outdegree.write.format("org.apache.spark.sql.cassandra").options(table="outdegree", keyspace=keyspace).save(mode="append")
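The Cassandra writes above depend on the spark-cassandra-connector package being supplied at submit time and on imports and a keyspace value defined outside this excerpt; a sketch of that missing scaffolding, with the keyspace value purely illustrative:

import sys

from pyspark import SparkConf, SparkContext, StorageLevel
from pyspark.sql import SQLContext

keyspace = "reddit"  # illustrative value; the original defines it elsewhere

if __name__ == "__main__":
    main(sys.argv)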