How to use the pandas.concat function in pandas

To help you get started, we’ve selected a few pandas.concat examples based on popular ways it is used in public projects.

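Before diving into the project examples, a quick refresher: pd.concat takes a sequence (or any iterable) of DataFrames or Series and stacks them row-wise (axis=0, the default) or column-wise (axis=1). A minimal sketch with made-up frames:

import pandas as pd

a = pd.DataFrame({'x': [1, 2], 'y': [3, 4]})
b = pd.DataFrame({'x': [5, 6], 'y': [7, 8]})

# Row-wise: stack the frames; ignore_index=True renumbers the combined rows.
rows = pd.concat([a, b], ignore_index=True)

# Column-wise: align on the index and place the frames side by side.
cols = pd.concat([a, b.add_prefix('b_')], axis=1)

print(rows)
print(cols)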

github okfn-brasil / serenata-de-amor / research / src / geocode_addresses.py
return pd.Series()



if not os.path.exists(TEMP_PATH):
    os.makedirs(TEMP_PATH)

data = pd.read_csv(DATASET_PATH, low_memory=False)
geocoded_cnpjs = [filename[:14]
                  for filename in os.listdir(TEMP_PATH)
                  if filename.endswith('.pkl')]
is_not_geocoded = ~data['cnpj'].str.replace(CNPJ_REGEX, '').isin(geocoded_cnpjs)
remaining_companies = data[is_not_geocoded]
print('%i companies, %i to go' % (len(data), len(remaining_companies)))
geocode_companies(remaining_companies)
data = pd.concat([data,
                  data.apply(read_geocoding_info, axis=1)], axis=1)
data.to_csv(DATASET_PATH,
            compression='xz',
            encoding='utf-8',
            index=False)
shutil.rmtree(TEMP_PATH)
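The concat call above follows a common enrichment pattern: apply a row-wise function that returns a pd.Series, which yields a DataFrame of new columns, then attach those columns to the original frame with axis=1. A self-contained sketch of the same idea, with an invented enrich function and column names standing in for read_geocoding_info:

import pandas as pd

def enrich(row):
    # A row-wise function that returns a Series becomes extra columns after apply(axis=1).
    return pd.Series({'latitude': None, 'longitude': None, 'name_length': len(row['name'])})

data = pd.DataFrame({'name': ['Acme Ltda', 'Foo SA']})
data = pd.concat([data, data.apply(enrich, axis=1)], axis=1)
print(data.columns.tolist())  # ['name', 'latitude', 'longitude', 'name_length']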
github Mouse-Imaging-Centre / pydpiper / pydpiper / pipelines / registration_tamarack.py
return ys + [ys[-1] + [xs[0]]]

    xfms_to_common = (
        first_level_results
        .assign(uncomposed_xfms=suffixes(list(before.xfm))[:-1] + [None] + prefixes(list(after.xfm))[1:])
        .assign(xfm_to_common=lambda df: df.apply(axis=1, func=lambda row:
                                ((lambda x: s.defer(invert_xfmhandler(x)) if row.group >= common_time_pt else x)
                                   (s.defer(concat_xfmhandlers(row.uncomposed_xfms,
                                                               name=("%s_to_common"
                                                                     if row.group < common_time_pt
                                                                     else "%s_from_common") % row.group))))
                                  if row.uncomposed_xfms is not None else None))
        .drop('uncomposed_xfms', axis=1))  # TODO None => identity??

    # TODO indexing here is not good ...
    first_level_determinants = pd.concat(list(first_level_results.build_model.apply(
                                                lambda x: x.determinants.assign(first_level_avg=x.avg_img))),
                                         ignore_index=True)

    resampled_determinants = (
        pd.merge(left=first_level_determinants,
                 right=xfms_to_common.assign(source=lambda df: df.xfm_to_common.apply(
                                                              lambda x:
                                                                x.source if x is not None else None)),
                 left_on="first_level_avg", right_on='source')
        .assign(resampled_log_full_det=lambda df: df.apply(axis=1, func=lambda row:
                                         s.defer(mincresample_new(img=row.log_full_det,
                                                                  xfm=row.xfm_to_common.xfm,
                                                                  like=common_model))
                                                 if row.xfm_to_common is not None else row.img),
                resampled_log_nlin_det=lambda df: df.apply(axis=1, func=lambda row:
                                         s.defer(mincresample_new(img=row.log_nlin_det,
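The pd.concat call in this excerpt flattens a Series of per-model DataFrames into one table, tagging each piece via assign before stacking and discarding the original indices with ignore_index=True. A stripped-down sketch of that list-of-frames pattern (the frame contents and labels are invented):

import pandas as pd

per_model = [
    pd.DataFrame({'log_det': [0.1, 0.2]}).assign(first_level_avg='avg_img_0'),
    pd.DataFrame({'log_det': [0.3]}).assign(first_level_avg='avg_img_1'),
]

# ignore_index=True gives the combined frame a fresh 0..n-1 index
# instead of repeating each piece's own index.
determinants = pd.concat(per_model, ignore_index=True)
print(determinants)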
github KrishnaswamyLab / scprep / scprep / io.py
def _read_csv_sparse(filename, chunksize=1000000, fill_value=0.0, **kwargs):
    """Read a csv file into a pandas.SparseDataFrame
    """
    chunks = pd.read_csv(filename, chunksize=chunksize, **kwargs)
    data = pd.concat(chunk.to_sparse(fill_value=fill_value)
                     for chunk in chunks)
    return data
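pd.concat accepts any iterable of frames, including the chunk iterator returned by read_csv(chunksize=...), which is what makes this chunked-read-then-concat pattern work. DataFrame.to_sparse was removed in pandas 1.0, but the pattern itself still applies; a minimal dense sketch for current pandas (the function name read_csv_chunked is invented):

import pandas as pd

def read_csv_chunked(filename, chunksize=1_000_000, **kwargs):
    # read_csv with chunksize yields DataFrames one chunk at a time,
    # and pd.concat stitches them back into a single frame.
    chunks = pd.read_csv(filename, chunksize=chunksize, **kwargs)
    return pd.concat(chunks)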
github oscar-franzen / alona / alona / hvg.py
        num_bin = 20

        gene_mean = self._exp_mean(data_norm)
        # equal width (not size) of bins
        bins = pd.cut(gene_mean, num_bin)

        ret = []

        for _, sliced in data_norm.groupby(bins):
            # Axis 0 will act on all the ROWS in each COLUMN
            # Axis 1 will act on all the COLUMNS in each ROW
            dispersion = sliced.var(axis=1)/sliced.mean(axis=1)
            zscores = (dispersion-dispersion.mean())/dispersion.std()
            ret.append(zscores)

        ret = pd.concat(ret)
        ret = ret.sort_values(ascending=False)
        self.top_hvg = ret.head(self.hvg_n)

        ret = np.array(self.top_hvg.index)
        log_debug('Finishing hvg_seurat()')
        return ret
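Here pd.concat reassembles the per-bin Series of z-scores into one Series indexed by gene, which is then ranked to select highly variable genes. A compact, self-contained sketch of the same group-then-recombine pattern, using synthetic data in place of data_norm:

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
data_norm = pd.DataFrame(rng.lognormal(size=(100, 20)))  # rows = genes, columns = cells

gene_mean = data_norm.mean(axis=1)
bins = pd.cut(gene_mean, 5)  # equal-width bins over mean expression

pieces = []
for _, sliced in data_norm.groupby(bins):
    dispersion = sliced.var(axis=1) / sliced.mean(axis=1)
    pieces.append((dispersion - dispersion.mean()) / dispersion.std())

# Concatenating the per-bin Series restores one Series covering all genes.
zscores = pd.concat(pieces).sort_values(ascending=False)
top_genes = zscores.head(10).index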
github lenskit / lkpy / lenskit / batch.py
        _logger.info('starting predict process with %d workers', nprocs)
        with MPRecContext(algo, model),  Pool(nprocs) as pool:
            results = pool.map(_predict_worker, pairs.groupby('user'))
        results = [pd.read_msgpack(r) for r in results]
        _logger.info('finished predictions')
    else:
        results = []
        for user, udf in pairs.groupby('user'):
            if pfun:
                res = pfun(user, udf['item'])
                res = pd.DataFrame({'user': user, 'item': res.index, 'prediction': res.values})
            else:
                res = _predict_user(algo, model, user, udf)
            results.append(res)

    results = pd.concat(results)
    if 'rating' in pairs:
        return pairs.join(results.set_index(['user', 'item']), on=('user', 'item'))
    return results
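The serial branch above shows a typical accumulate-and-concat loop: build one small predictions frame per user, collect the frames in a list, and call pd.concat once at the end rather than growing a frame inside the loop. A minimal sketch with a dummy constant prediction standing in for the real algorithm:

import pandas as pd

pairs = pd.DataFrame({'user': [1, 1, 2], 'item': [10, 11, 10], 'rating': [4.0, 3.0, 5.0]})

results = []
for user, udf in pairs.groupby('user'):
    results.append(pd.DataFrame({'user': user,
                                 'item': udf['item'].values,
                                 'prediction': 0.5}))

results = pd.concat(results, ignore_index=True)

# Attach the predictions back onto the original pairs via the (user, item) key.
scored = pairs.join(results.set_index(['user', 'item']), on=['user', 'item'])
print(scored)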
github ikki407 / stacking / examples / Santander / ikki_feat_ver3.py
    #print 'starting cleaning_rfe...'
    #train, test = cleaning_rfe(ori_train=train.copy(), ori_test=test.copy())
    #print 'done cleaning_rfe'

    # make dummy variables of var3 in the threshold(>=5)
    var3_cnt = train.var3.value_counts()
    index_var3_th = var3_cnt[(var3_cnt>=5).values].index
    train['var3_tmp'] = train.var3.apply(lambda x: x if x in index_var3_th else np.nan)
    test['var3_tmp'] = test.var3.apply(lambda x: x if x in index_var3_th else np.nan)
    
    train_test = pd.concat([train,test])
    #train_test.reset_index(drop=True, inplace=True)
    tmp = pd.get_dummies(train_test['var3_tmp'], prefix='ohe_var3', prefix_sep='_')

    train = pd.concat([train, tmp.iloc[:len(train),:]], axis=1)
    test = pd.concat([test, tmp.iloc[len(train):,:]], axis=1)
    del train['var3_tmp'], test['var3_tmp']

    # add feature of var38
    train['var38mc'] = np.isclose(train.var38, 117310.979016)
    train['logvar38'] = train.loc[~train['var38mc'], 'var38'].map(np.log)
    train.loc[train['var38mc'], 'logvar38'] = 0

    test['var38mc'] = np.isclose(test.var38, 117310.979016)
    test['logvar38'] = test.loc[~test['var38mc'], 'var38'].map(np.log)
    test.loc[test['var38mc'], 'logvar38'] = 0

    train['var38mc'] = train['var38mc'].astype(int)

    test['var38mc'] = test['var38mc'].astype(int)
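The var3 block above illustrates the stack-encode-split trick: concatenate train and test so pd.get_dummies produces one consistent set of dummy columns, then slice the dummies back apart by position and attach each half with an axis=1 concat. A small sketch with toy frames:

import pandas as pd

train = pd.DataFrame({'var3': [1, 1, 2, 9], 'target': [0, 1, 0, 1]})
test = pd.DataFrame({'var3': [2, 9, 9]})
n_train = len(train)

# Encode on the stacked data so both frames share the same dummy columns.
train_test = pd.concat([train, test], ignore_index=True)
dummies = pd.get_dummies(train_test['var3'], prefix='ohe_var3')

# Split the dummies back by position; reset the index so axis=1 alignment works.
train = pd.concat([train, dummies.iloc[:n_train].reset_index(drop=True)], axis=1)
test = pd.concat([test, dummies.iloc[n_train:].reset_index(drop=True)], axis=1)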
github ActivitySim / populationsim / populationsim / multi_integerizer.py
        zone_weights_df[sub_geography] = zone_id
        zone_weights_df['balanced_weight'] = weights.values
        zone_weights_df['integer_weight'] = integer_weights.astype(int).values

        if status in STATUS_SUCCESS:
            integerized_weights_list.append(zone_weights_df)
            integerized_zone_ids.append(zone_id)
        else:
            rounded_weights_list.append(zone_weights_df)
            rounded_zone_ids.append(zone_id)

    if combine_results:
        integerized_weights_df = pd.concat(integerized_weights_list + rounded_weights_list)
        return integerized_weights_df

    integerized_weights_df = pd.concat(integerized_weights_list) if integerized_zone_ids else None
    rounded_weights_df = pd.concat(rounded_weights_list) if rounded_zone_ids else None

    return integerized_zone_ids, rounded_zone_ids, integerized_weights_df, rounded_weights_df
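Both branches above guard their pd.concat calls, because pd.concat raises a ValueError when given an empty sequence; the combined path simply concatenates the two lists of per-zone frames end to end. A compact sketch of the same guard-and-combine logic with toy data:

import pandas as pd

integerized = [pd.DataFrame({'zone': 1, 'weight': [2, 3]})]
rounded = []

# One combined frame, or the two groups kept separate, with empty lists mapped to None.
combined = pd.concat(integerized + rounded)
integerized_df = pd.concat(integerized) if integerized else None
rounded_df = pd.concat(rounded) if rounded else None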
github roclark / sportsreference / sportsreference / nba / schedule.py
    def dataframe(self):
        """
        Returns a pandas DataFrame where each row is a representation of the
        Game class. Rows are indexed by the boxscore string.
        """
        frames = []
        for game in self.__iter__():
            df = game.dataframe
            if df is not None:
                frames.append(df)
        if frames == []:
            return None
        return pd.concat(frames)
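dataframe() gathers one frame per Game and returns None when there is nothing to combine, since pd.concat cannot be called on an empty list. The same defensive pattern as a standalone sketch (the combine helper and toy boxscore index are invented):

import pandas as pd

def combine(frames_or_none):
    # Skip missing entries and bail out before concat sees an empty list.
    frames = [f for f in frames_or_none if f is not None]
    if not frames:
        return None
    return pd.concat(frames)

games = [pd.DataFrame({'points': [110]}, index=['201612250CLE']), None]
print(combine(games))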
github microsoft / anomalydetector / aml_module / sr_detector.py
        frame['timestamp'] = timestamp
        frame['value'] = data_to_detect.iloc[:, 0]
        output = sr_detect(frame, detect_mode, batch_size, threshold, sensitivity)
        log_plot_result(frame, output, data_to_detect.columns[0], detect_mode)
    else:
        logging.debug(f'detect {column_length} columns')
        output = pd.DataFrame()

        for col in data_to_detect.columns:
            frame = pd.DataFrame(columns=['timestamp', 'value'])
            frame['timestamp'] = timestamp
            frame['value'] = data_to_detect[col]
            result = sr_detect(frame, detect_mode, batch_size, threshold, sensitivity)
            log_plot_result(frame, result, col, detect_mode)
            result.columns = [f'{rc}_{col}' for rc in result.columns]
            output = pd.concat((output, result), axis=1)

    return output
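In the multi-column branch, pd.concat with axis=1 grows a wide output frame one detection result at a time, after renaming each result's columns so they stay distinct per input column. A minimal sketch with a stand-in detector (the threshold rule replaces sr_detect purely for illustration):

import pandas as pd

data_to_detect = pd.DataFrame({'cpu': [1.0, 2.0, 9.0], 'mem': [0.5, 0.4, 0.6]})

output = pd.DataFrame()
for col in data_to_detect.columns:
    # Stand-in for sr_detect: any per-column result frame works here.
    result = pd.DataFrame({'value': data_to_detect[col],
                           'isAnomaly': data_to_detect[col] > 5})
    result.columns = [f'{rc}_{col}' for rc in result.columns]
    # axis=1 keeps one row per timestamp and appends the new columns on the right.
    output = pd.concat((output, result), axis=1)

print(output.columns.tolist())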