    return key, 1

def parseNeighbors(urls):
    parts = re.split(r',', urls)
    return parts[0], int(parts[2]) / int(parts[3])

def parseNeighbors1(urls):
    parts = re.split(r',', urls)
    return parts[0], parts[1]

if __name__ == "__main__":
    if len(sys.argv) != 4:
        print("Usage: pagerank ", file=sys.stderr)
        sys.exit(-1)

    # Initialize the Spark session.
    spark = SparkSession\
        .builder\
        .appName("PythonPageRank")\
        .getOrCreate()

    some_value = float(sys.argv[3]) / 100
    if some_value == 0:
        some_value = 0.8

    lines = spark.read.text(sys.argv[1]).rdd.map(lambda r: r[0])
    links2 = lines.map(lambda urls: parseNeighbors(urls)).groupByKey().mapValues(sum).cache()
    ranks = links2.map(lambda x: compute(x[0], x[1]))
    prevranks = links2.map(lambda x: compute(x[0], x[1]))
    links1 = lines.map(lambda urls: parseNeighbors1(urls)).groupByKey().cache()
    count_value = 0
    count = 0
    t = True
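The excerpt stops right after initializing `ranks`, `prevranks`, and the loop flag `t`. As a point of reference, a minimal sketch of the kind of iterative update this setup points toward follows, in the style of the standard PySpark PageRank example; the contribution logic, the iteration bound, and the use of `some_value` as the damping weight are assumptions, not the original author's loop.

# Sketch only: an assumed PageRank-style update, not the original continuation.
# links1 maps each URL to its neighbors; ranks maps each URL to its current rank.
while t and count < 20:
    contribs = links1.join(ranks).flatMap(
        lambda kv: [(dest, kv[1][1] / len(kv[1][0])) for dest in kv[1][0]])
    ranks = contribs.reduceByKey(lambda a, b: a + b) \
                    .mapValues(lambda r: some_value * r + (1 - some_value))
    count += 1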
    parts = re.split(r',+', urls)
    return parts[0], parts[1]

if __name__ == "__main__":
    if len(sys.argv) != 4:
        print("Usage: pagerank ", file=sys.stderr)
        sys.exit(-1)

    if int(sys.argv[3]) == 0:
        first = 0.80
        second = 0.20
    if int(sys.argv[3]) > 0:
        first = int(sys.argv[3]) * 0.01   # argv values are strings; convert before scaling
        second = 1 - first

    # Initialize the Spark session.
    spark = SparkSession\
        .builder\
        .appName("PythonPageRank")\
        .getOrCreate()

    lines = spark.read.text(sys.argv[1]).rdd.map(lambda r: r[0])
    # print(lines.collect())
    links = lines.map(lambda urls: parseNeighbors(urls)).distinct().groupByKey().cache()
    # print(links.collect())
    inter_ranks = lines.map(lambda urls: avg(urls)).distinct().reduceByKey(add)
    ranks = inter_ranks.map(lambda x: (x[0], max(x[1], 1))).sortBy(lambda x: (x[1], x[0]), False)
    # t = ranks.collect()
    # print(ranks)
    # print("RANKS:", ranks.collect())
import pytest
from pyspark.sql import SparkSession

@pytest.fixture
def spark(request):
    """
    Creates a SparkSession for testing.

    Parameters
    ----------
    request: pytest.FixtureRequest object
        provides access to testing context
    """
    spark = (
        SparkSession
        .builder
        .appName('pytest-pyspark-local-testing')
        .master('local[2]')
        .getOrCreate()
    )
    request.addfinalizer(lambda: spark.stop())
    return spark
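A test consuming this fixture might look like the sketch below; the test name and the DataFrame contents are illustrative, not taken from the project.

# Hypothetical test using the `spark` fixture defined above.
def test_creates_dataframe(spark):
    df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "label"])
    assert df.count() == 2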
# * Create python SparkContext using the SparkConf (so we can specify the warehouse.dir)
# * Create Scala side HiveTestContext SparkSession
# * Create python SparkSession
jgw = launch_gateway(None)
jvm = jgw.jvm
import tempfile
import getpass
hivedir = "file://{0}/{1}/smv_hive_test".format(tempfile.gettempdir(), getpass.getuser())
sConf = SparkConf(False, _jvm=jvm).set("spark.sql.test", "")\
    .set("spark.sql.hive.metastore.barrierPrefixes",
         "org.apache.spark.sql.hive.execution.PairSerDe")\
    .set("spark.sql.warehouse.dir", hivedir)\
    .set("spark.ui.enabled", "false")
sc = SparkContext(master="local[1]", appName="SMV Python Test", conf=sConf, gateway=jgw).getOrCreate()
jss = sc._jvm.org.apache.spark.sql.hive.test.SmvTestHive.createContext(sc._jsc.sc())
cls.spark = SparkSession(sc, jss.sparkSession())
return cls.spark
def start(self):
    spark = SparkSession.builder.getOrCreate()
    return spark
def setSparkContext(sc):
    """
    Before using the matrix, the user needs to invoke this function if SparkContext is not
    previously created in the session.

    Parameters
    ----------
    sc: SparkContext
        SparkContext
    """
    matrix.sc = sc
    matrix.sparkSession = SparkSession.builder.getOrCreate()
    matrix.ml = MLContext(matrix.sc)
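A minimal usage sketch, assuming this is called once before constructing any `matrix` objects; the local SparkContext here is illustrative.

# Sketch: ensure a SparkContext exists, then register it for the matrix API.
from pyspark import SparkContext
sc = SparkContext.getOrCreate()
setSparkContext(sc)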
def download_reduced_mmtf_files(pdbIds):
    '''Downloads and reads the specified PDB entries in reduced MMTF format
    using MMTF web services.

    Parameters
    ----------
    pdbIds : list
        PDB IDs to download

    Returns
    -------
    data
        structure data as keyword/value pairs
    '''
    spark = SparkSession.builder.getOrCreate()
    sc = spark.sparkContext

    return sc.parallelize(set(pdbIds)) \
        .map(lambda t: _get_structure(t, True)) \
        .filter(lambda t: t is not None)
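A short usage sketch; the PDB IDs are illustrative, and `count()` simply forces the download.

# Sketch: download two reduced-representation entries and count the successes.
structures = download_reduced_mmtf_files(["4HHB", "1AQ1"])
print(structures.count())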
Feature Elimination Lift: 0.015065852924005307
"""
print(__doc__)
import time
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from skdist.distribute.eliminate import DistFeatureEliminator
from sklearn.datasets import fetch_covtype
from pyspark.sql import SparkSession
# instantiate spark session
spark = (
SparkSession
.builder
.getOrCreate()
)
sc = spark.sparkContext
# params
scoring = "f1_weighted"
cv = 5
min_features_to_select = 10
# load data and define base classifier
X, y = fetch_covtype(return_X_y=True)
clf = RandomForestClassifier(n_estimators=100, max_depth=10)
# eliminate features, keeping at least 10
start = time.time()
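The excerpt stops at the timer; a plausible continuation is sketched below, assuming `DistFeatureEliminator` accepts the base estimator, the SparkContext, and the scoring/CV settings defined above (check the skdist docs for the exact signature).

# Sketch of the elimination step (assumed arguments; not the original continuation).
model = DistFeatureEliminator(
    clf, sc=sc, scoring=scoring, cv=cv,
    min_features_to_select=min_features_to_select)
model.fit(X, y)
print("DistFeatureEliminator fit time:", time.time() - start)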
def __init__(self, app_name):
    self.spark = SparkSession \
        .builder \
        .appName(app_name) \
        .enableHiveSupport() \
        .config("spark.rpc.message.maxSize", "60") \
        .config("spark.files.overwrite", "true") \
        .config("spark.hadoop.validateOutputSpecs", "false") \
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
        .config("spark.kryoserializer.buffer.max", "2000m") \
        .config("fs.defaultFS", "hdfs://mgjcluster") \
        .getOrCreate()
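The class that owns this `__init__` is not shown in the excerpt, so the name in the sketch below is a placeholder; it only illustrates reaching the configured, Hive-enabled session.

# Hypothetical usage; `SparkHiveJob` stands in for the unnamed enclosing class.
job = SparkHiveJob("example-app")
job.spark.sql("SHOW DATABASES").show()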
def _text_file_with_path(sc, path):
    """Return an RDD that yields (path, line) for each line in the file.

    *path* must be a single path, not a comma-separated list of paths
    """
    from pyspark.sql import SparkSession
    from pyspark.sql import functions as F

    spark = SparkSession(sc)
    df = spark.read.text(path).select([
        F.input_file_name().alias('input_file_name'),
        F.col('value')
    ])
    return df.rdd.map(
        lambda row: (row.input_file_name,
                     (row.value if isinstance(row.value, bytes)
                      else row.value.encode('utf_8')))
    )
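A usage sketch; the path is illustrative and an existing SparkContext `sc` is assumed.

# Sketch: pair each line with the file it came from and peek at a few records.
pairs = _text_file_with_path(sc, "hdfs:///data/input/events.txt")
for file_name, line in pairs.take(3):
    print(file_name, line)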