How to use the wasabi.msg.info function in wasabi

To help you get started, we’ve selected a few wasabi examples based on popular ways it is used in public projects.
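
Every example below uses msg, the default Printer instance that wasabi exposes at the top level. Its methods print a formatted, color-coded message built from a title and optional detail text. A minimal sketch of the core calls:

from wasabi import msg

msg.info("Processing started")                 # informational message
msg.info("Vectors", "1024 vectors, 3 senses")  # title plus detail text
msg.good("Model saved")                        # success message
msg.warn("No GPU detected")                    # warning
msg.fail("Can't find input file", exits=1)     # error, then sys.exit(1)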

From explosion/sense2vec: scripts/05_export.py

        vec = item[1:]
        if len(vec) != vector_size:
            msg.fail(f"Wrong vector size: {len(vec)} (expected {vector_size})", exits=1)
        all_senses.add(sense)
        vectors[key] = numpy.asarray(vec, dtype=numpy.float32)
    discarded = set()
    discarded.update(get_minority_keys(vocab, min_freq_ratio))
    discarded.update(get_redundant_keys(vocab, vectors, min_distance))
    n_vectors = len(vectors) - len(discarded)
    s2v = Sense2Vec(shape=(n_vectors, vector_size), senses=all_senses)
    for key, vector in vectors.items():
        if key not in discarded:
            s2v.add(key, vector)
            s2v.set_freq(key, vocab[key])
    msg.good("Created the sense2vec model")
    msg.info(f"{n_vectors} vectors, {len(all_senses)} total senses")
    s2v.to_disk(output_path)
    msg.good("Saved model to directory", out_dir)

From explosion/spaCy: spacy/__main__.py

    commands = {
        "download": download,
        "link": link,
        "info": info,
        "train": train,
        "pretrain": pretrain,
        "debug-data": debug_data,
        "evaluate": evaluate,
        "convert": convert,
        "package": package,
        "init-model": init_model,
        "profile": profile,
        "validate": validate,
    }
    if len(sys.argv) == 1:
        msg.info("Available commands", ", ".join(commands), exits=1)
    command = sys.argv.pop(1)
    sys.argv[0] = "spacy %s" % command
    if command in commands:
        plac.call(commands[command], sys.argv[1:])
    else:
        available = "Available: {}".format(", ".join(commands))
        msg.fail("Unknown command: {}".format(command), available, exits=1)

From explosion/sense2vec: sense2vec/prodigy_recipes.py

    def eval_dataset(set_id):
        DB = connect()
        data = DB.get_dataset(set_id)
        accepted = [eg for eg in data if eg["answer"] == "accept" and eg.get("accept")]
        rejected = [eg for eg in data if eg["answer"] == "reject"]
        ignored = [eg for eg in data if eg["answer"] == "ignore"]
        if not accepted and not rejected:
            msg.warn("No annotations collected", exits=1)
        total_count = 0
        agree_count = 0
        for eg in accepted:
            total_count += len(eg.get("options", []))
            agree_count += len(eg.get("accept", []))
        msg.info(f"Evaluating data from '{set_id}'")
        msg.text(f"You rejected {len(rejected)} and ignored {len(ignored)} pair(s)")
        pc = agree_count / total_count
        text = f"You agreed {agree_count} / {total_count} times ({pc:.0%})"
        if pc > 0.5:
            msg.good(text)
        else:
            msg.fail(text)

From explosion/sense2vec: scripts/01_parse.py

def main(in_file, out_dir, spacy_model="en_core_web_sm", n_process=1, max_docs=10**6):
    """
    Step 1: Parse raw text with spaCy

    Expects an input file with one sentence per line and will output a .spacy
    file of the parsed collection of Doc objects (DocBin).
    """
    input_path = Path(in_file)
    output_path = Path(out_dir)
    if not input_path.exists():
        msg.fail("Can't find input file", in_file, exits=1)
    if not output_path.exists():
        output_path.mkdir(parents=True)
        msg.good(f"Created output directory {out_dir}")
    nlp = spacy.load(spacy_model)
    msg.info(f"Using spaCy model {spacy_model}")
    doc_bin = DocBin(attrs=["POS", "TAG", "DEP", "ENT_TYPE", "ENT_IOB"])
    msg.text("Preprocessing text...")
    count = 0
    batch_num = 0
    with input_path.open("r", encoding="utf8") as texts:
        docs = nlp.pipe(texts, n_process=n_process)
        for doc in tqdm.tqdm(docs, desc="Docs", unit=""):
            if count < max_docs:
                doc_bin.add(doc)
                count += 1
            else:
                batch_num += 1
                count = 0
                msg.good(f"Processed {len(doc_bin)} docs")
                doc_bin_bytes = doc_bin.to_bytes()
                output_file = output_path / f"{input_path.stem}-{batch_num}.spacy"

From explosion/sense2vec: sense2vec/prodigy_recipes.py

        if not accepted and not rejected:
            msg.warn("No annotations collected", exits=1)
        high_conf = 0.8
        agree_count = 0
        disagree_high_conf = len([e for e in rejected if e["confidence"] > high_conf])
        for eg in accepted:
            choice = eg["accept"][0]
            score_choice = [o["score"] for o in eg["options"] if o["id"] == choice][0]
            score_other = [o["score"] for o in eg["options"] if o["id"] != choice][0]
            if score_choice > score_other:
                agree_count += 1
            elif eg["confidence"] > high_conf:
                disagree_high_conf += 1
        pc = agree_count / (len(accepted) + len(rejected))
        text = f"You agreed {agree_count} / {len(data)} times ({pc:.0%})"
        msg.info(f"Evaluating data from '{set_id}'")
        if pc > 0.5:
            msg.good(text)
        else:
            msg.fail(text)
        msg.text(f"You disagreed on {disagree_high_conf} high confidence scores")
        msg.text(f"You rejected {len(rejected)} suggestions as not similar")

From explosion/sense2vec: sense2vec/prodigy_recipes.py

    def eval_dataset(set_id):
        DB = connect()
        data = DB.get_dataset(set_id)
        accepted = [eg for eg in data if eg["answer"] == "accept" and eg.get("accept")]
        rejected = [eg for eg in data if eg["answer"] == "reject"]
        ignored = [eg for eg in data if eg["answer"] == "ignore"]
        if not accepted and not rejected:
            msg.warn("No annotations collected", exits=1)
        counts = Counter()
        for eg in accepted:
            for model_id in eg["accept"]:
                counts[model_id] += 1
        preference, _ = counts.most_common(1)[0]
        ratio = f"{counts[preference]} / {sum(counts.values()) - counts[preference]}"
        msg.info(f"Evaluating data from '{set_id}'")
        msg.text(f"You rejected {len(rejected)} and ignored {len(ignored)} pair(s)")
        if counts["A"] == counts["B"]:
            msg.warn(f"No preference ({ratio})")
        else:
            pc = counts[preference] / sum(counts.values())
            msg.good(f"You preferred vectors {preference} with {ratio} ({pc:.0%})")
            msg.text(mapping[preference])

From explosion/sense2vec: scripts/03_glove_build_counts.py

        msg.fail("Can't find GloVe build directory", glove_dir, exits=1)
    if not input_path.exists() or not input_path.is_dir():
        msg.fail("Not a valid input directory", in_dir, exits=1)
    input_files = [str(fp) for fp in input_path.iterdir() if fp.suffix == ".s2v"]
    if not input_files:
        msg.fail("No .s2v files found in input directory", in_dir, exits=1)
    msg.info(f"Using {len(input_files)} input files")
    if not output_path.exists():
        output_path.mkdir(parents=True)
        msg.good(f"Created output directory {out_dir}")

    vocab_file = output_path / "vocab.txt"
    cooc_file = output_path / "cooccurrence.bin"
    cooc_shuffle_file = output_path / "cooccurrence.shuf.bin"

    msg.info("Creating vocabulary counts")
    cmd = (
        f"cat {' '.join(input_files)} | {glove_dir}/vocab_count "
        f"-min-count {min_count} -verbose {verbose} > {vocab_file}"
    )
    print(cmd)
    vocab_cmd = os.system(cmd)
    if vocab_cmd != 0 or not Path(vocab_file).exists():
        msg.fail("Failed creating vocab counts", exits=1)
    msg.good("Created vocab counts", vocab_file)

    msg.info("Creating cooccurrence statistics")
    cmd = (
        f"cat {' '.join(input_files)} | {glove_dir}/cooccur -memory {memory} "
        f"-vocab-file {vocab_file} -verbose {verbose} "
        f"-window-size {window_size} > {cooc_file}"
    )

From explosion/spaCy: spacy/cli/profile.py

def profile(model, inputs=None, n_texts=10000):
    """
    Profile a spaCy pipeline, to find out which functions take the most time.
    Input should be formatted as one JSON object per line with a key "text".
    It can either be provided as a JSONL file, or be read from sys.stdin.
    If no input file is specified, the IMDB dataset is loaded via Thinc.
    """
    if inputs is not None:
        inputs = _read_inputs(inputs, msg)
    if inputs is None:
        n_inputs = 25000
        with msg.loading("Loading IMDB dataset via Thinc..."):
            imdb_train, _ = thinc.extra.datasets.imdb()
            inputs, _ = zip(*imdb_train)
        msg.info("Loaded IMDB dataset and using {} examples".format(n_inputs))
        inputs = inputs[:n_inputs]
    with msg.loading("Loading model '{}'...".format(model)):
        nlp = load_model(model)
    msg.good("Loaded model '{}'".format(model))
    texts = list(itertools.islice(inputs, n_texts))
    cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof")
    s = pstats.Stats("Profile.prof")
    msg.divider("Profile stats")
    s.strip_dirs().sort_stats("time").print_stats()
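
Besides msg.info and msg.good, this example uses two other Printer helpers: msg.loading, a context manager that shows an animated spinner while a slow operation runs, and msg.divider, which prints a labelled horizontal rule. A minimal sketch:

import time
from wasabi import msg

with msg.loading("Crunching numbers..."):
    time.sleep(2)          # spinner is shown while this block runs
msg.good("Done")           # printed once the spinner has stopped
msg.divider("Results")     # labelled divider line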

From explosion/sense2vec: scripts/04_fasttext_train_vectors.py

        msg.good("Successfully saved fastText model to disk", output_file)
    else:
        fasttext_model = None
        msg.fail("Must provide an input directory or fastText binary filepath", exits=1)

    msg.info("Creating vocabulary file")
    vocab_file = output_path / "vocab.txt"
    words, freqs = fasttext_model.get_words(include_freq=True)
    with vocab_file.open('w', encoding='utf8') as f:
        for i in range(len(words)):
            f.write(words[i] + " " + str(freqs[i]) + " word\n")
    if not vocab_file.exists() or not vocab_file.is_file():
        msg.fail("Failed to create vocabulary", vocab_file, exits=1)
    msg.good("Successfully created vocabulary file", vocab_file)

    msg.info("Creating vectors file")
    vectors_file = output_path / "vectors.txt"
    # Adapted from https://github.com/facebookresearch/fastText/blob/master/python/doc/examples/bin_to_vec.py#L31
    with vectors_file.open('w', encoding='utf-8') as file_out:
        # the first line must contain the number of total words and vector dimension
        file_out.write(str(len(words)) + " " + str(fasttext_model.get_dimension()) + '\n')
        # line by line, append vector to vectors file
        for w in words:
            v = fasttext_model.get_word_vector(w)
            vstr = ""
            for vi in v:
                vstr += " " + str(vi)
            try:
                file_out.write(w + vstr + '\n')
            except IOError as e:
                if e.errno == EPIPE:
                    pass
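
Finally, msg is just a module-level instance of wasabi's Printer class, so you can construct your own if the defaults don't fit, for example to capture formatted messages as strings instead of printing them. A minimal sketch:

from wasabi import Printer

printer = Printer(no_print=True)   # format messages instead of printing them
formatted = printer.info("Vectors", "1024 vectors, 3 senses")
print(repr(formatted))             # styled string, ready for logging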