How to use the pyspark.ml.param.Param class in pyspark

To help you get started, we’ve selected a few pyspark examples based on how pyspark.ml.param.Param is commonly used in popular open source projects.
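Before looking at the project snippets below, here is a minimal sketch of the basic pattern (not taken from any of these projects; the HasThreshold mixin and the "threshold" parameter are illustrative names): a Param is declared as a class-level placeholder on a Params subclass with a name, a doc string and an optional type converter, set through _set or _setDefault, and read back with getOrDefault.

from pyspark.ml.param import Param, Params, TypeConverters

class HasThreshold(Params):
    # Class-level placeholder; current pyspark copies it onto each instance
    # with the instance as parent, so Params._dummy() never leaks at runtime.
    threshold = Param(Params._dummy(), "threshold",
                      "decision threshold used to turn scores into labels",
                      typeConverter=TypeConverters.toFloat)

    def __init__(self):
        super(HasThreshold, self).__init__()
        self._setDefault(threshold=0.5)

    def setThreshold(self, value):
        # _set applies the typeConverter and records the user-supplied value
        return self._set(threshold=value)

    def getThreshold(self):
        return self.getOrDefault(self.threshold)

t = HasThreshold()
t.getThreshold()    # 0.5, the default
t.setThreshold(0.7)
t.getThreshold()    # 0.7

The snippets below use exactly this machinery, either in the older Spark 1.x style (a Param created inside __init__ with the instance as parent) or in the current placeholder style shown here.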

github qubole / spark-on-lambda / python / pyspark / ml / tests.py View on GitHub
    def __init__(self):
        super(HasInducedError, self).__init__()
        self.inducedError = Param(self, "inducedError",
                                  "Uniformly-distributed error added to feature")
github UCLA-VAST / blaze / spark-1.5.1 / python / pyspark / ml / tuning.py View on GitHub
    def __init__(self, estimator=None, estimatorParamMaps=None, evaluator=None, numFolds=3):
        """
        __init__(self, estimator=None, estimatorParamMaps=None, evaluator=None, numFolds=3)
        """
        super(CrossValidator, self).__init__()
        #: param for estimator to be cross-validated
        self.estimator = Param(self, "estimator", "estimator to be cross-validated")
        #: param for estimator param maps
        self.estimatorParamMaps = Param(self, "estimatorParamMaps", "estimator param maps")
        #: param for the evaluator used to select hyper-parameters that
        #: maximize the cross-validated metric
        self.evaluator = Param(
            self, "evaluator",
            "evaluator used to select hyper-parameters that maximize the cross-validated metric")
        #: param for number of folds for cross validation
        self.numFolds = Param(self, "numFolds", "number of folds for cross validation")
        self._setDefault(numFolds=3)
        kwargs = self.__init__._input_kwargs
        self._set(**kwargs)
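For comparison, this is how those params are typically filled in through the public pyspark.ml.tuning API; a rough usage sketch (not taken from the repository above), assuming an active SparkSession and a training DataFrame named train with 'features' and 'label' columns:

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

lr = LogisticRegression()
grid = (ParamGridBuilder()
        .addGrid(lr.regParam, [0.01, 0.1])
        .build())
cv = CrossValidator(estimator=lr,
                    estimatorParamMaps=grid,
                    evaluator=BinaryClassificationEvaluator(),
                    numFolds=3)
# cvModel = cv.fit(train)  # train is the assumed DataFrame

The keyword-only constructor above forwards each of these keyword arguments into _set, which is why every constructor argument has a matching Param declared on the class.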
github aws / sagemaker-spark / sagemaker-pyspark-sdk / src / sagemaker_pyspark / algorithms / LinearLearnerSageMakerEstimator.py View on GitHub

    epochs = Param(Params._dummy(), "epochs",
                   "The number of passes done over the training data. Must be > 0. ",
                   typeConverter=TypeConverters.toInt)

    predictor_type = Param(Params._dummy(), "predictor_type",
                           "Whether training is for binary classification or regression. "
                           "Supported options: 'binary_classifier', and 'regressor'. ",
                           typeConverter=TypeConverters.toString)

    use_bias = Param(Params._dummy(), "use_bias",
                     "Whether model should include bias. ",
                     typeConverter=TypeConverters.toString)

    num_models = Param(Params._dummy(), "num_models",
                       "Number of models to train in parallel. Must be > 0  or 'auto'. ",
                       typeConverter=TypeConverters.toString)

    num_calibration_samples = Param(Params._dummy(), "num_calibration_samples",
                                    "Number of samples to use from validation dataset for doing "
                                    "model calibration (finding the best threshold). "
                                    "Must be > 0.",
                                    typeConverter=TypeConverters.toInt)

    init_method = Param(Params._dummy(), "init_method",
                        "Initialization function for the model weights. "
                        "Supported options: 'uniform' and 'normal'. ",
                        typeConverter=TypeConverters.toString)

    init_scale = Param(Params._dummy(), "init_scale",
                       "Scale for init method uniform. Must be > 0. ",
                       typeConverter=TypeConverters.toFloat)
github aws / sagemaker-spark / sagemaker-pyspark-sdk / src / sagemaker_pyspark / algorithms / XGBoostSageMakerEstimator.py View on GitHub
    gamma = Param(
        Params._dummy(), "gamma",
        "Minimum loss reduction required to make an additional partition on a leaf node"
        " of the tree. The larger the value, the more conservative the algorithm will be."
        "Must be >= 0.",
        typeConverter=TypeConverters.toFloat)

    max_depth = Param(
        Params._dummy(), "max_depth",
        "Maximum depth of a tree. Increasing this value makes the model more complex and "
        "likely to be overfitted. 0 indicates no limit. A limit is required when"
        "grow_policy=depth-wise. Must be >= 0. Default value is 6",
        typeConverter=TypeConverters.toInt)

    min_child_weight = Param(
        Params._dummy(), "min_child_weight",
        "Minimum sum of instance weight (hessian) needed in a child. If the tree partition step "
        "results in a leaf node with the sum of instance weight less than min_child_weight, then "
        "the building process will give up further partitioning. In linear regression mode, "
        "this simply corresponds to minimum number of instances needed to be in each node. "
        "The larger the value, the more conservative the algorithm will be. Must be >= 0.",
        typeConverter=TypeConverters.toFloat)

    max_delta_step = Param(
        Params._dummy(), "max_delta_step",
        "Maximum delta step we allow each tree's weight estimation to be. "
        "If the value is set to 0, it means there is no constraint. If it is set to a positive "
        "value, it can help make the update step more conservative. Usually this parameter is "
        "not needed, but it might help in logistic regression when the classes are extremely"
        " imbalanced. Setting it to value of 1-10 might help control the update. Must be >= 0.",
        typeConverter=TypeConverters.toFloat)
github aws / sagemaker-spark / sagemaker-pyspark-sdk / src / sagemaker_pyspark / algorithms / XGBoostSageMakerEstimator.py View on GitHub
    booster = Param(
        Params._dummy(), "booster",
        "Which booster to use. Can be 'gbtree', 'gblinear' or 'dart'. "
        "gbtree and dart use tree based model while gblinear uses linear function.",
        typeConverter=TypeConverters.toString)

    silent = Param(
        Params._dummy(), "silent",
        "Whether in silent mode."
        "0 means print running messages, 1 means silent mode.",
        typeConverter=TypeConverters.toInt)

    nthread = Param(
        Params._dummy(), "nthread",
        "Number of parallel threads used to run xgboot. Must be >= 1.",
        typeConverter=TypeConverters.toInt)

    eta = Param(
        Params._dummy(), "eta",
        "Step size shrinkage used in update to prevent overfitting. After each boosting step, "
        "we can directly get the weights of new features. and eta shrinks the feature weights "
        "to make the boosting process more conservative. Must be in [0, 1].",
        typeConverter=TypeConverters.toFloat)

github lifeomic / sparkflow / sparkflow / tensorflow_async.py View on GitHub
    tensorflowGraph = Param(Params._dummy(), "tensorflowGraph", "", typeConverter=TypeConverters.toString)
    tfInput = Param(Params._dummy(), "tfInput", "", typeConverter=TypeConverters.toString)
    tfOutput = Param(Params._dummy(), "tfOutput", "", typeConverter=TypeConverters.toString)
    tfLabel = Param(Params._dummy(), "tfLabel", "", typeConverter=TypeConverters.toString)
    tfOptimizer = Param(Params._dummy(), "tfOptimizer", "", typeConverter=TypeConverters.toString)
    tfLearningRate = Param(Params._dummy(), "tfLearningRate", "", typeConverter=TypeConverters.toFloat)
    iters = Param(Params._dummy(), "iters", "", typeConverter=TypeConverters.toInt)
    partitions = Param(Params._dummy(), "partitions", "", typeConverter=TypeConverters.toInt)
    miniBatchSize = Param(Params._dummy(), "miniBatchSize", "", typeConverter=TypeConverters.toInt)
    miniStochasticIters = Param(Params._dummy(), "miniStochasticIters", "", typeConverter=TypeConverters.toInt)
    verbose = Param(Params._dummy(), "verbose", "", typeConverter=TypeConverters.toInt)
    acquireLock = Param(Params._dummy(), "acquireLock", "", typeConverter=TypeConverters.toBoolean)
    shufflePerIter = Param(Params._dummy(), "shufflePerIter", "", typeConverter=TypeConverters.toBoolean)
    tfDropout = Param(Params._dummy(), "tfDropout", "", typeConverter=TypeConverters.toString)
    toKeepDropout = Param(Params._dummy(), "toKeepDropout", "", typeConverter=TypeConverters.toBoolean)
    partitionShuffles = Param(Params._dummy(), "partitionShuffles", "", typeConverter=TypeConverters.toInt)
    optimizerOptions = Param(Params._dummy(), "optimizerOptions", "", typeConverter=TypeConverters.toString)
    port = Param(Params._dummy(), "port", "", typeConverter=TypeConverters.toInt)

    @keyword_only
    def __init__(self,
                 inputCol=None,
                 tensorflowGraph=None,
                 tfInput=None,
                 tfLabel=None,
                 tfOutput=None,
                 tfOptimizer=None,
                 tfLearningRate=None,
                 iters=None,
                 predictionCol=None,
                 partitions=None,
                 miniBatchSize = None,
github dmmiller612 / sparktorch / sparktorch / torch_distributed.py View on GitHub
    PysparkReaderWriter,
    MLReadable,
    MLWritable,
    Identifiable
):

    torchObj = Param(Params._dummy(), "torchObj", "The serialized torch object", typeConverter=TypeConverters.toString)
    mode = Param(Params._dummy(), "mode", "The training mode", typeConverter=TypeConverters.toString)
    device = Param(Params._dummy(), "device", "", typeConverter=TypeConverters.toString)
    iters = Param(Params._dummy(), "iters", "", typeConverter=TypeConverters.toInt)
    partitions = Param(Params._dummy(), "partitions", "", typeConverter=TypeConverters.toInt)
    verbose = Param(Params._dummy(), "verbose", "", typeConverter=TypeConverters.toInt)
    acquireLock = Param(Params._dummy(), "acquireLock", "", typeConverter=TypeConverters.toBoolean)
    partitionShuffles = Param(Params._dummy(), "partitionShuffles", "", typeConverter=TypeConverters.toInt)
    port = Param(Params._dummy(), "port", "", typeConverter=TypeConverters.toInt)
    useBarrier = Param(Params._dummy(), "useBarrier", "", typeConverter=TypeConverters.toBoolean)
    useVectorOut = Param(Params._dummy(), "useVectorOut", "", typeConverter=TypeConverters.toBoolean)
    earlyStopPatience = Param(Params._dummy(), "earlyStopPatience", "", typeConverter=TypeConverters.toInt)
    miniBatch = Param(Params._dummy(), "miniBatch", "", typeConverter=TypeConverters.toInt)
    validationPct = Param(Params._dummy(), "validationPct", "", typeConverter=TypeConverters.toFloat)

    @keyword_only
    def __init__(
        self,
        inputCol=None,
        labelCol=None,
        torchObj=None,
        iters=None,
        predictionCol=None,
        partitions=None,
        acquireLock=None,
        verbose=None,
github UCLA-VAST / blaze / spark-1.5.1 / python / pyspark / ml / param / shared.py View on GitHub
    def __init__(self):
        super(HasRawPredictionCol, self).__init__()
        #: param for raw prediction (a.k.a. confidence) column name
        self.rawPredictionCol = Param(self, "rawPredictionCol", "raw prediction (a.k.a. confidence) column name")
        self._setDefault(rawPredictionCol='rawPrediction')
github UCLA-VAST / blaze / spark-1.5.1 / python / pyspark / ml / param / shared.py View on GitHub
    def setRegParam(self, value):
        """
        Sets the value of :py:attr:`regParam`.
        """
        self._paramMap[self.regParam] = value
        return self

    def getRegParam(self):
        """
        Gets the value of regParam or its default value.
        """
        return self.getOrDefault(self.regParam)


class HasFeaturesCol(Params):
    """
    Mixin for param featuresCol: features column name.
    """

    # a placeholder to make it appear in the generated doc
    featuresCol = Param(Params._dummy(), "featuresCol", "features column name")

    def __init__(self):
        super(HasFeaturesCol, self).__init__()
        #: param for features column name
        self.featuresCol = Param(self, "featuresCol", "features column name")
        self._setDefault(featuresCol='features')

    def setFeaturesCol(self, value):
        """
        Sets the value of :py:attr:`featuresCol`.
        """
        self._paramMap[self.featuresCol] = value
        return self

    def getFeaturesCol(self):
        """
        Gets the value of featuresCol or its default value.
        """
        return self.getOrDefault(self.featuresCol)
github aws / sagemaker-spark / sagemaker-pyspark-sdk / src / sagemaker_pyspark / algorithms / XGBoostSageMakerEstimator.py View on GitHub
    subsample = Param(
        Params._dummy(), "subsample",
        "Subsample ratio of the training instance. Setting it to 0.5 means that XGBoost will "
        "randomly collect half of the data instances to grow trees and this will "
        "prevent overfitting. Must be (0, 1].",
        typeConverter=TypeConverters.toFloat)

    colsample_bytree = Param(