How to use the pyspark.sql.functions module in pyspark

To help you get started, we’ve selected a few pyspark.sql.functions examples, based on popular ways it is used in public projects.

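Before the project examples below, here is a minimal, self-contained sketch of the usual import-and-use pattern; the data and column names are made up for illustration.

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([('a', 1), ('a', 2), ('b', 3)], ['key', 'value'])

# Functions from pyspark.sql.functions are conventionally imported under the alias F
# and used to build column expressions for select/agg/filter.
df.groupBy('key').agg(
    F.collect_set('value').alias('values'),
    F.max('value').alias('max_value'),
).show()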

github wikimedia / search-MjoLniR / mjolnir / features.py
    if indices is None:
        indices = {}
    eltType, name, store = mjolnir.utils.explode_ltr_model_definition(model)
    log_query = LtrLoggingQuery(eltType, name, store)

    def kafka_handle_response(record):
        assert record['status_code'] == 200
        parsed = json.loads(record['text'])
        response = parsed['responses'][0]
        meta = record['meta']

        for hit_page_id, features in extract_ltr_log_feature_values(response, feature_names_accu):
            yield [meta['wikiid'], meta['query'], hit_page_id, features]

    rdd = mjolnir.kafka.client.msearch(
        df.groupBy('wikiid', 'query').agg(F.collect_set('hit_page_id').alias('hit_page_ids')),
        client_config=brokers,
        meta_keys=['wikiid', 'query'],
        create_es_query=lambda row: log_query.make_msearch(row, indices),
        handle_response=kafka_handle_response)

    return df.sql_ctx.createDataFrame(rdd, T.StructType([
        df.schema['wikiid'], df.schema['query'], df.schema['hit_page_id'],
        T.StructField('features', VectorUDT(), nullable=False)
        # We could have gotten duplicate data from kafka. Clean them up.
    ])).drop_duplicates(['wikiid', 'query', 'hit_page_id'])
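
The pyspark.sql.functions call driving this excerpt is F.collect_set, which gathers the distinct hit_page_id values for each (wikiid, query) group before the msearch requests are built; drop_duplicates later removes any duplicate Kafka responses. A standalone sketch of that grouping step, with made-up data:

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [('enwiki', 'cat', 1), ('enwiki', 'cat', 2), ('enwiki', 'cat', 2)],
    ['wikiid', 'query', 'hit_page_id'])

# collect_set aggregates the distinct hit_page_id values per (wikiid, query) group.
df.groupBy('wikiid', 'query').agg(
    F.collect_set('hit_page_id').alias('hit_page_ids')).show()
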
github databricks / koalas / databricks / koalas / base.py
        >>> ks.Series(['a', None]).hasnans
        True

        >>> ks.Series([1.0, 2.0, np.nan]).hasnans
        True

        >>> ks.Series([1, 2, 3]).hasnans
        False

        >>> ks.Series([1, 2, 3]).rename("a").to_frame().set_index("a").index.hasnans
        False
        """
        sdf = self._internal._sdf.select(self._scol)
        col = self._scol

        ret = sdf.select(F.max(col.isNull() | F.isnan(col))).collect()[0][0]
        return ret
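
The hasnans check above reduces the whole column to a single boolean with F.max over col.isNull() | F.isnan(col). A standalone sketch of the same idea, with made-up data:

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
sdf = spark.createDataFrame([(1.0,), (float('nan'),), (None,)], ['x'])

# True if any value in the column is null or NaN, mirroring the hasnans check above.
has_nans = sdf.select(F.max(F.col('x').isNull() | F.isnan('x'))).collect()[0][0]
print(has_nans)  # True
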
github databricks / koalas / databricks / koalas / namespace.py
        column_set = set(columns)
        remaining_columns = [kdf[column] for column in kdf.columns if column not in column_set]

    if any(not isinstance(kdf._internal.spark_type_for(column), _get_dummies_acceptable_types)
           for column in columns):
        raise ValueError("get_dummies currently only accept {} values"
                         .format(', '.join([t.typeName() for t in _get_dummies_acceptable_types])))

    if prefix is not None and len(columns) != len(prefix):
        raise ValueError(
            "Length of 'prefix' ({}) did not match the length of the columns being encoded ({})."
            .format(len(prefix), len(columns)))

    all_values = _reduce_spark_multi(kdf._sdf,
                                     [F.collect_set(kdf._internal.scol_for(column)).alias(column)
                                      for column in columns])
    for i, column in enumerate(columns):
        values = sorted(all_values[i])
        if drop_first:
            values = values[1:]

        def column_name(value):
            if prefix is None:
                return str(value)
            else:
                return '{}{}{}'.format(prefix[i], prefix_sep, value)

        for value in values:
            remaining_columns.append((kdf[column].notnull() & (kdf[column] == value))
                                     .astype(dtype)
                                     .rename(column_name(value)))
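
The encoding step relies on F.collect_set to find the distinct values of each column and then builds one indicator column per value. A simplified sketch of that pattern on a single made-up column:

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
sdf = spark.createDataFrame([('red',), ('blue',), ('red',)], ['color'])

# Collect the distinct category values, then add one 0/1 indicator column per value.
values = sorted(sdf.select(F.collect_set('color')).collect()[0][0])
sdf.select(
    '*',
    *[(F.col('color') == v).cast('int').alias('color_' + v) for v in values]
).show()
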
github horovod / horovod / examples / keras_spark_rossmann.py
# Cast continuous columns to float & lookup categorical columns.
train_df = cast_columns(train_df, continuous_cols + ['Sales'])
train_df = lookup_columns(train_df, vocab)
test_df = cast_columns(test_df, continuous_cols)
test_df = lookup_columns(test_df, vocab)

# Split into training & validation.
# Test set is in 2015, use the same period in 2014 from the training set as a validation set.
test_min_date = test_df.agg(F.min(test_df.Date)).collect()[0][0]
test_max_date = test_df.agg(F.max(test_df.Date)).collect()[0][0]
a_year = datetime.timedelta(365)
val_df = train_df.filter((test_min_date - a_year <= train_df.Date) & (train_df.Date < test_max_date - a_year))
train_df = train_df.filter((train_df.Date < test_min_date - a_year) | (train_df.Date >= test_max_date - a_year))

# Determine max Sales number.
max_sales = train_df.agg(F.max(train_df.Sales)).collect()[0][0]

print('===================================')
print('Data frame with transformed columns')
print('===================================')
train_df.show()

print('================')
print('Data frame sizes')
print('================')
train_rows, val_rows, test_rows = train_df.count(), val_df.count(), test_df.count()
print('Training: %d' % train_rows)
print('Validation: %d' % val_rows)
print('Test: %d' % test_rows)

# Save data frames as Parquet files.
train_df.write.parquet('%s/train_df.parquet' % DATA_LOCATION, mode='overwrite')
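
The date-based split above pulls scalar aggregates back to the driver with F.min and F.max followed by .collect()[0][0]. A minimal sketch of that pattern on made-up dates:

import datetime

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [(datetime.date(2015, 8, 1),), (datetime.date(2015, 9, 17),)], ['Date'])

# agg + collect()[0][0] brings a single scalar value (here a date bound) to the driver.
min_date = df.agg(F.min(df.Date)).collect()[0][0]
max_date = df.agg(F.max(df.Date)).collect()[0][0]
print(min_date, max_date)
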
github MrPowers / quinn / quinn / transformations.py
            lambda col_name: F.col("`{0}`".format(col_name)).alias(fun(col_name)) if change_col_name(col_name) else F.col("`{0}`".format(col_name)),
            df.columns
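
This fragment maps F.col(...).alias(...) over df.columns to rename only the columns that match a predicate; the backticks let it handle column names containing dots or spaces. A small self-contained sketch of the same idea (the predicate and rename function here are hypothetical stand-ins):

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, 2)], ['keep me', 'other'])

def change_col_name(col_name):
    # Hypothetical predicate: rename columns that contain a space.
    return ' ' in col_name

def fun(col_name):
    # Hypothetical rename function.
    return col_name.replace(' ', '_')

renamed = df.select([
    F.col('`{0}`'.format(c)).alias(fun(c)) if change_col_name(c) else F.col('`{0}`'.format(c))
    for c in df.columns])
renamed.show()
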
github CoffeaTeam / coffea / coffea / processor / spark / spark_executor.py
@fn.pandas_udf(StructType([StructField('histos', BinaryType(), True)]), fn.PandasUDFType.GROUPED_MAP)
def reduce_histos(df):
    global processor_instance, lz4_clevel
    return reduce_histos_raw(df, processor_instance, lz4_clevel)
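
The decorator registers a grouped-map pandas UDF: Spark passes each group to the function as a pandas DataFrame and expects a pandas DataFrame matching the declared schema in return. A minimal standalone sketch of the same decorator pattern (the schema and column names are made up; newer Spark versions prefer groupBy(...).applyInPandas):

import pandas as pd
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import StructType, StructField, StringType, DoubleType

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([('a', 1.0), ('a', 2.0), ('b', 3.0)], ['key', 'value'])

schema = StructType([StructField('key', StringType(), True),
                     StructField('total', DoubleType(), True)])

@F.pandas_udf(schema, F.PandasUDFType.GROUPED_MAP)
def sum_per_group(pdf):
    # Each group arrives as a pandas DataFrame; return one matching the declared schema.
    return pd.DataFrame({'key': [pdf['key'].iloc[0]], 'total': [float(pdf['value'].sum())]})

df.groupby('key').apply(sum_per_group).show()
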
github databricks / koalas / databricks / koalas / numpy_compat.py
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from collections import OrderedDict
from typing import Callable, Any

import numpy as np
from pyspark.sql import functions as F, Column
from pyspark.sql.types import DoubleType, LongType, BooleanType


unary_np_spark_mappings = OrderedDict({
    'abs':  F.abs,
    'absolute': F.abs,
    'arccos': F.acos,
    'arccosh': F.pandas_udf(lambda s: np.arccosh(s), DoubleType()),
    'arcsin': F.asin,
    'arcsinh': F.pandas_udf(lambda s: np.arcsinh(s), DoubleType()),
    'arctan': F.atan,
    'arctanh': F.pandas_udf(lambda s: np.arctanh(s), DoubleType()),
    'bitwise_not': F.bitwiseNOT,
    'cbrt': F.cbrt,
    'ceil': F.ceil,
    'conj': lambda _: NotImplemented,  # It requires complex type which Koalas does not support yet
    'conjugate': lambda _: NotImplemented,  # It requires complex type
    'cos': F.cos,
    'cosh': F.pandas_udf(lambda s: np.cosh(s), DoubleType()),
    'deg2rad': F.pandas_udf(lambda s: np.deg2rad(s), DoubleType()),
    'degrees': F.degrees,
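
Each entry in this mapping points a NumPy ufunc name either at a native Spark function or, where Spark has no built-in equivalent, at a pandas UDF that applies the NumPy function element-wise. A short sketch of using one of those pandas-UDF fallbacks directly, with made-up data:

import numpy as np
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import DoubleType

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1.0,), (2.0,)], ['x'])

# Spark has no built-in arcsinh, so wrap the NumPy ufunc in a scalar pandas UDF.
arcsinh = F.pandas_udf(lambda s: np.arcsinh(s), DoubleType())
df.select(arcsinh(F.col('x')).alias('arcsinh_x')).show()
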
github databricks / koalas / databricks / koalas / base.py
    def _shift(self, periods, fill_value, part_cols=()):
        if not isinstance(periods, int):
            raise ValueError('periods should be an int; however, got [%s]' % type(periods))

        col = self._scol
        window = Window.partitionBy(*part_cols).orderBy(self._internal.index_scols)\
            .rowsBetween(-periods, -periods)
        lag_col = F.lag(col, periods).over(window)
        col = F.when(lag_col.isNull() | F.isnan(lag_col), fill_value).otherwise(lag_col)
        return self._with_new_scol(col).rename(self.name)
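
The shift is built from F.lag over a window ordered by the index columns, with F.when substituting the fill value wherever the lagged value is null or NaN. A standalone sketch of that combination (the column names and fill value are made up):

from pyspark.sql import SparkSession, functions as F
from pyspark.sql.window import Window

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, 10.0), (2, 20.0), (3, 30.0)], ['idx', 'value'])

window = Window.orderBy('idx')
lagged = F.lag('value', 1).over(window)
# Replace the missing leading value with a fill value, as the _shift helper does.
shifted = F.when(lagged.isNull() | F.isnan(lagged), 0.0).otherwise(lagged)
df.select('idx', shifted.alias('value_shifted')).show()
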
github databricks / koalas / databricks / koalas / series.py
    def _nunique(self, dropna=True, approx=False, rsd=0.05):
        colname = self._internal.data_columns[0]
        count_fn = partial(F.approx_count_distinct, rsd=rsd) if approx else F.countDistinct
        if dropna:
            return count_fn(self._scol).alias(colname)
        else:
            return (count_fn(self._scol) +
                    F.when(F.count(F.when(self._scol.isNull(), 1)
                                   .otherwise(None)) >= 1, 1).otherwise(0)).alias(colname)
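
F.countDistinct ignores nulls, so the snippet adds a 0/1 correction term built from F.count over a F.when that marks null rows; F.approx_count_distinct trades exactness for speed via the rsd parameter. A standalone sketch of both variants, with made-up data:

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1,), (1,), (2,), (None,)], ['x'])

exact = F.countDistinct('x')
approx = F.approx_count_distinct('x', rsd=0.05)
# countDistinct ignores nulls, so add 1 when at least one null row exists.
null_adjust = F.when(F.count(F.when(F.col('x').isNull(), 1)) >= 1, 1).otherwise(0)

df.select((exact + null_adjust).alias('nunique'),
          approx.alias('approx_nunique')).show()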