import codecs
import re
from typing import Set

import pandas as pd
from arche.rules.result import Result  # assumed import path for arche's Result
from tqdm import tqdm


def garbage_symbols(df: pd.DataFrame) -> Result:
    """Find unwanted symbols in object columns.

    Returns:
        A result containing item keys per field which contained any trash symbol
    """
    garbage = (
        r"(?P<spaces>^\s|\s$)"
        r"|(?P<html_entities>&[a-zA-Z]{2,}?;|&#\d*?;)"
        r"|(?P<css>[.#@][^\d{}#.\s][^{}#.]+?{(?:[^:;{}]+?:[^:;{}]+?;)+?\s*?})"
        # NOTE: the html_tags alternation was garbled in extraction;
        # a generic tag matcher is assumed here
        r"|(?P<html_tags></?[a-zA-Z][^<>]*?>)"
    )
    errors = {}
    row_keys: Set = set()
    rule_result = Result("Garbage Symbols", items_count=len(df))
    # np.object was removed in NumPy 1.24; select object-dtype columns by name
    for column in tqdm(df.select_dtypes(["object"]).columns, desc="Garbage Symbols"):
        matches = df[column].apply(str).str.extractall(garbage, flags=re.IGNORECASE)
        if not matches.empty:
            error_keys = df.loc[matches.unstack().index.values].index
            bad_texts = matches.stack().value_counts().index.sort_values().tolist()
            # escape backslashes for markdown repr, `\n > \\n`
            bad_texts = [
                f"'{codecs.encode(bx, 'unicode_escape').decode()[:20]}'"
                for bx in bad_texts
            ]
            error = (
                f"{len(error_keys)/len(df)*100:.1f}% of '{column}' "
                f"values contain `{', '.join(bad_texts)}`"
            )
            errors[error] = list(error_keys)
            row_keys = row_keys.union(error_keys)
    return rule_result
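
# A minimal sketch of invoking the rule above on toy data; it assumes the
# garbage_symbols() definition and arche's Result type from this excerpt.
import pandas as pd

df = pd.DataFrame({"title": ["clean", "trailing space ", "a &amp; b"]})
result = garbage_symbols(df)  # flags the trailing space and the HTML entity
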

def tqdm_notebook(*args, **kwargs):  # pragma: no cover
    """See tqdm.notebook.tqdm for full documentation"""
    from .notebook import tqdm as _tqdm_notebook
    from warnings import warn
    warn("This function will be removed in tqdm==5.0.0\n"
         "Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`",
         TqdmDeprecationWarning, stacklevel=2)
    return _tqdm_notebook(*args, **kwargs)
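
# Preferred modern spelling, per the warning above: import the notebook
# class directly instead of going through tqdm.tqdm_notebook.
from tqdm.notebook import tqdm

for _ in tqdm(range(100)):
    pass
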

def submit_contributions(
    self, contributions, skip_dupe_check=False, ignore_dupes=False, limit=200
):
    """Convenience function to submit a list of contributions"""
    # prepare structures/tables
    with tqdm(total=len(contributions)) as pbar:
        existing = {
            "ids": set(),
            "identifiers": set(),
            "structures": set(),
            "tables": set(),
        }
        unique_identifiers = True
        if not skip_dupe_check:
            name = contributions[0]["project"]
            resp = self.projects.get_entry(
                pk=name, _fields=["unique_identifiers"]
            ).result()
            unique_identifiers = resp["unique_identifiers"]
            pbar.set_description("Get existing contribution(s)")
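
# Hypothetical call site, assuming an authenticated mpcontribs-client
# Client; the project name and payload fields are illustrative only.
from mpcontribs.client import Client

client = Client(apikey="...")  # MPContribs API key
contribs = [
    {"project": "my_project", "identifier": "mp-149", "data": {"energy": "1.2 eV"}}
]
client.submit_contributions(contribs, skip_dupe_check=False)
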

def categorize(df: pd.DataFrame) -> pd.DataFrame:
    """Cast columns with repeating values to `category` type to save memory"""
    if len(df) < 100:
        return df  # too few rows for categorization to pay off
    for c in tqdm(df.columns, desc="Categorizing"):
        try:
            if df[c].nunique(dropna=False) <= 10:
                df[c] = df[c].astype("category")
        # ignore lists and dicts columns, which are unhashable
        except TypeError:
            continue
    return df
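
# Quick check of the memory effect, assuming the categorize() above.
import pandas as pd

df = pd.DataFrame({"color": ["red", "blue"] * 500, "value": range(1000)})
before = df.memory_usage(deep=True).sum()
categorize(df)  # "color" has 2 uniques -> category; "value" has 1000 -> untouched
after = df.memory_usage(deep=True).sum()
print(f"{before} -> {after} bytes")
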

def f(*args, **kw):
    if getattr(CURRENT, "context", None) is None:
        # pick a progress implementation appropriate to where stdout points
        if sys.stdout.isatty():
            klass = tqdm
        elif using_notebook():
            klass = notebook.tqdm
        elif isinstance(sys.stdout, io.FileIO):
            klass = LogFileOutput
        else:
            klass = None
        if klass is None:
            CURRENT.context = NULL_CONTEXT
        else:
            CURRENT.context = ProgressContext(klass)
    parent = CURRENT.context
    show_progress = kw.pop("show_progress", None)
    if show_progress is False:
        subcontext = NULL_CONTEXT
    else:
        subcontext = parent.subcontext()
    kw["ui"] = CURRENT.context = subcontext

def get_categories(df: pd.DataFrame, max_uniques: int = 10) -> Result:
    """Find category columns, which are columns that hold a limited number
    of possible values, including `NAN`.

    Args:
        df: data
        max_uniques: filter which determines which columns to use. Only columns with
            a number of unique values less than or equal to `max_uniques` are
            treated as category columns.

    Returns:
        A result with stats containing value counts of categorical columns.
    """
    result = Result("Categories")
    columns = find_likely_cats(df, max_uniques)
    result.stats = [
        value_counts
        for value_counts in tqdm(
            map(lambda c: df[c].value_counts(dropna=False), columns),
            desc="Finding categories",
            total=len(columns),
        )
        if len(value_counts) <= max_uniques
    ]
    if not result.stats:
        result.add_info("Categories were not found")
        return result
    result.add_info(f"{len(result.stats)} category field(s)")
    result.outcome = Outcome.INFO
    return result
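
# Toy run, assuming the get_categories() above together with arche's
# Result/Outcome types and its find_likely_cats() helper.
import pandas as pd

df = pd.DataFrame({"size": ["S", "M", "L"] * 40, "id": range(120)})
result = get_categories(df, max_uniques=5)
# "size" (3 uniques) is reported as a category; "id" (120 uniques) is not
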

def delete_contributions(self, project):
    """Convenience function to remove all contributions for a project"""
    resp = self.contributions.get_entries(
        project=project, _fields=["id"], _limit=1
    ).result()
    ncontribs = resp["total_count"]
    if ncontribs:
        has_more, limit = True, 250
        with tqdm(total=ncontribs) as pbar:
            pbar.set_description("Delete contribution(s)")
            while has_more:
                resp = self.contributions.delete_entries(
                    project=project, _limit=limit
                ).result()
                has_more = resp["has_more"]
                pbar.update(resp["count"])
        if resp["count"]:
            self.load()
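
# Hypothetical cleanup call, assuming an authenticated mpcontribs-client Client.
from mpcontribs.client import Client

client = Client(apikey="...")
client.delete_contributions("my_project")  # drops every contribution in the project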