Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
from funcy import count_by
def functional():
    """Count parsed log records by their ``'hour'`` field.

    Looks at each of the previous ``days_of_logs`` days (1 day ago up to
    ``days_of_logs`` days ago, relative to ``today``), fetches that day's
    raw output from ``logs`` using a ``'%Y/%m/%d'`` key, splits it into
    lines, drops falsy (empty) lines, parses each remaining line as JSON
    and hands the records to ``count_by`` keyed on ``'hour'``.

    Relies on ``logs``, ``today``, ``days_of_logs``, ``mapcat`` and
    ``count_by`` being available in the enclosing module scope.
    """
    day_offsets = range(1, days_of_logs + 1)
    dates = map(lambda days_ago: today - timedelta(days=days_ago), day_offsets)
    outputs = map(lambda date: logs[date.strftime('%Y/%m/%d')], dates)
    lines = mapcat(lambda output: output.strip().split('\n'), outputs)
    non_empty_lines = filter(None, lines)
    records = map(json.loads, non_empty_lines)
    return count_by(itemgetter('hour'), records)
print functional()
from toolz.curried import map, filter, mapcat, curry
count_by = curry(count_by)
from bookends import _
def piped():
    """Count parsed log records by ``'hour'`` using a bookends pipeline.

    Same computation as a nested map/filter chain, but written top to
    bottom: day offsets -> dates -> raw log output -> lines -> non-empty
    lines -> JSON records -> counts keyed on ``'hour'``.

    Every stage is called with a single argument, so this depends on the
    curried ``map``/``filter``/``mapcat`` and the curried ``count_by``
    bound at module level, plus ``logs``, ``today`` and ``days_of_logs``.
    """
    def to_date(days_ago):
        return today - timedelta(days=days_ago)

    def read_log(date):
        return logs[date.strftime('%Y/%m/%d')]

    def split_lines(output):
        return output.strip().split('\n')

    count_by_hour = count_by(itemgetter('hour'))

    return (_ | range(1, days_of_logs + 1)
              | map(to_date)
              | map(read_log)
              | mapcat(split_lines)
              | filter(None)
              | map(json.loads)
              | count_by_hour
              | _)
print piped()
@curry
def stability_curve_time_in_space_splitter(train_data: pd.DataFrame,
training_time_limit: DateType,
space_column: str,
time_column: str,
freq: str = 'M',
space_hold_percentage: float = 0.5,
random_state: int = None,
min_samples: int = 1000) -> SplitterReturnType:
"""
Splits the data into temporal buckets given by the specified frequency.
Training set is fixed before hold out and uses a rolling window hold out set.
Each fold moves the hold out further into the future.
Useful to see how model performance degrades as the training data gets more
outdated. Folds are made so that ALL IDs in the holdout also appear in
the training set.
@curry
def gpd_to_values(data):
    """Convert ``data`` into an inline-values data model.

    A ``geopandas.GeoDataFrame`` is sanitized with
    ``alt.utils.sanitize_dataframe``, converted with
    ``geopandas_to_dict`` and returned as ``{'values': <JSON string>}``
    (column values end up as Foreign Members of the GeoJSON feature
    objects). Any other input is delegated unchanged to
    :py:func:`altair.to_values`.
    """
    # Non-geo inputs take the stock Altair path.
    if not isinstance(data, gpd.GeoDataFrame):
        return alt.to_values(data)

    sanitized = alt.utils.sanitize_dataframe(data)
    features = geopandas_to_dict(sanitized)
    return {'values': json.dumps(features)}
DataFrame with tolerances.
"""
from toolz.curried import curry, compose
import pandas as pd
import numpy as np
__all__ = ['duplicates_allclose']
# Curried aliases of unbound ``pandas.DataFrame`` methods and ``numpy.append``.
# Wrapping them in toolz's ``curry`` lets callers supply arguments in stages
# (partial application) instead of all at once.
pdapply = curry(pd.DataFrame.apply) # pylint: disable=invalid-name
sort_values = curry(pd.DataFrame.sort_values) # pylint: disable=invalid-name
pdall = curry(pd.DataFrame.all) # pylint: disable=invalid-name
duplicated = curry(pd.DataFrame.duplicated) # pylint: disable=invalid-name
diff = curry(pd.DataFrame.diff) # pylint: disable=invalid-name
npappend = curry(np.append) # pylint: disable=invalid-name
def sequence(*args):
    """Compose functions so they are applied left to right.

    ``compose`` applies its arguments right to left, so the functions are
    handed over in reverse order: the first function given here runs
    first and the last one runs last.

    Args:
      args: the functions to compose, in application order

    Returns:
      the composed function

    >>> assert sequence(lambda x: x + 1, lambda x: x * 2)(3) == 8
    """
    right_to_left = tuple(reversed(args))
    return compose(*right_to_left)
@curry
def time_and_space_learning_curve_splitter(train_data: pd.DataFrame,
training_time_limit: str,
space_column: str,
time_column: str,
freq: str = 'M',
space_hold_percentage: float = 0.5,
holdout_gap: timedelta = timedelta(days=0),
random_state: int = None,
min_samples: int = 1000) -> SplitterReturnType:
"""
Splits the data into temporal buckets given by the specified frequency.
Uses a fixed out-of-ID and time hold out set for every fold.
Training size increases per fold, with more recent data being added in each fold.
Useful for learning curve validation, that is, for seeing how hold out performance
increases as the training size increases with more recent data.
def sample_url_line_delimited(data, lines=5, encoding='utf-8', timeout=None):
    """Write the first ``lines`` lines of a remote resource to a temp file.

    Works for URL-hosted CSV or line-delimited JSON: the resource at
    ``data.url`` is opened, at most ``lines`` lines are read, each line is
    stripped and decoded, and the result is joined with newlines and
    written to a temporary file whose name is then yielded.

    Parameters
    ----------
    data : URL(CSV)
        A hosted CSV; must expose ``url`` and ``filename`` attributes
    lines : int, optional, default ``5``
        Number of lines to read into memory
    encoding : str, optional, default ``'utf-8'``
        Encoding used to decode the downloaded lines and to write the
        temporary file
    timeout : optional, default ``None``
        Passed straight through to ``urlopen``

    Yields
    ------
    The temporary file name produced by ``tmpfile(data.filename)``.
    """
    with closing(urlopen(data.url, timeout=timeout)) as response:
        head = take(lines, response)
        stripped = map(bytes.strip, head)
        decoded = codecs.iterdecode(stripped, encoding=encoding)
        # Joining consumes the lazy stages while the connection is still open.
        newline = b'\n'.decode(encoding)
        raw = newline.join(decoded)
    with tmpfile(data.filename) as fn:
        with codecs.open(fn, 'wb', encoding=encoding) as out:
            out.write(raw)
        yield fn
@curry
def sample(data, n=None, frac=None):
    """Reduce the size of the data model by sampling without replacement.

    Parameters
    ----------
    data : pandas.DataFrame or dict
        The data model. It is validated with ``_check_data_type`` first.
    n : int, optional
        Number of items to draw.
    frac : float, optional
        Fraction of the items to draw. For dict data it is only consulted
        when ``n`` is falsy (``None`` or ``0``).

    Returns
    -------
    pandas.DataFrame or dict or None
        ``data.sample(n=n, frac=frac)`` for a DataFrame; a new
        ``{'values': [...]}`` dict for a dict that has a ``'values'`` key;
        ``None`` for a dict without ``'values'``.

    Raises
    ------
    ValueError
        If ``data`` is a dict with ``'values'`` and neither ``n`` nor
        ``frac`` is given. Previously this surfaced as an opaque
        ``TypeError`` from multiplying ``None`` by an int.
    """
    _check_data_type(data)
    if isinstance(data, pd.DataFrame):
        return data.sample(n=n, frac=frac)
    elif isinstance(data, dict):
        if 'values' in data:
            values = data['values']
            if not n:
                # No explicit count: derive it from the requested fraction.
                if frac is None:
                    raise ValueError(
                        'frac cannot be None if n is None and data is a dictionary'
                    )
                n = int(frac * len(values))
            values = random.sample(values, n)
            return {'values': values}
@curry
def remove_by_feature_shuffling(log: LogType,
predict_fn: PredictFnType,
eval_fn: EvalFnType,
eval_data: pd.DataFrame,
extractor: ExtractorFnType,
metric_name: str,
max_removed_by_step: int = 50,
threshold: float = 0.005,
speed_up_by_importance: bool = False,
parallel: bool = False,
nthread: int = 1,
seed: int = 7) -> List[str]:
"""
Performs feature selection based on the evaluation of the test vs the
evaluation of the test with randomly shuffled features
@curry
def get_avg_metric_from_extractor(logs: LogType, extractor: ExtractorFnType, metric_name: str) -> float:
    """Return the mean of ``metric_name`` over the extracted validator log.

    Runs ``extract`` on ``logs["validator_log"]`` with ``extractor``,
    selects ``metric_name`` from the result and returns its ``.mean()``.
    """
    validator_log = logs["validator_log"]
    extracted = extract(validator_log, extractor)
    metric_values = extracted[metric_name]
    return metric_values.mean()
@curry
def fduplicates(dcols, fcols, dataframe):
"""Check for mixture of exact duplicates and closeness
Args:
dcols: the columns to check for exact duplicates
fcols: the columns to check for closeness, a dict with column names as
keys and tolerances as values
dataframe: the dataframe
Returns:
a dataframe of the same shape but with bool values indicating
neighboring close values in the same column
>>> fduplicates(
... dcols=['C', 'D', 'E'],
... fcols=dict(A=0.02, B=0.02),