How to use dataprep - 10 common examples

To help you get started, we’ve selected a few dataprep examples based on popular ways it is used in public projects.

github Azure-Samples / MachineLearningSamples-ChurnPrediction / CATelcoCustomerChurnModeling.py
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
import numpy as np
import csv
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in scikit-learn 0.20
from sklearn.preprocessing import LabelEncoder

from azureml.logging import get_azureml_logger
from azureml.dataprep import Package  # assumed import path for the Package used below; not shown in the original snippet

# initialize the logger
run_logger = get_azureml_logger() 
run_logger.log('amlrealworld.ChurnPrediction.CATelcoCustomerChurnModeling','true')

# Load the training dataflow from the .dprep package and materialize it as a
# local pandas DataFrame (spark=False)
with Package.open_package('CATelcoCustomerChurnTrainingSample.dprep') as pkg:
    df = pkg.dataflows[0].get_dataframe(spark=False)

# One-hot encode every categorical/object column, prefixing each dummy column
# with the original column name
columns_to_encode = list(df.select_dtypes(include=['category', 'object']))
for column_to_encode in columns_to_encode:
    dummies = pd.get_dummies(df[column_to_encode])
    one_hot_col_names = []
    for col_name in list(dummies.columns):
        one_hot_col_names.append(column_to_encode + '_' + col_name)
    dummies.columns = one_hot_col_names
    df = df.drop(column_to_encode, axis=1)
    df = df.join(dummies)
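
The renaming loop above reproduces what pandas does by default when asked to encode specific columns; a shorter, equivalent alternative:

# Equivalent one-liner: get_dummies prefixes each dummy column with the source
# column name and drops the originals when given an explicit column list
df = pd.get_dummies(df, columns=columns_to_encode, prefix_sep='_')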

model = GaussianNB()

# Hold out 30% of the rows for evaluation
random_seed = 42
train, test = train_test_split(df, random_state=random_seed, test_size=0.3)
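
The sample stops after the split. A minimal sketch of how the held-out frame might then be used to fit and score the model; the label column name 'churn' is an assumption, not shown in the snippet:

# Hypothetical continuation: 'churn' as the label column is a guess from the
# churn-prediction context
train_x, train_y = train.drop('churn', axis=1), train['churn']
test_x, test_y = test.drop('churn', axis=1), test['churn']

model.fit(train_x, train_y)
accuracy = accuracy_score(test_y, model.predict(test_x))
run_logger.log('accuracy', accuracy)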

github iamprem / summarizer / tryit.py
    wordcloud = vertices.map(lambda l: dataprep.clean_vertex(l))
    vertices.cache()

github iamprem / summarizer / textrank.py
    vertices = revsents.map(lambda l: dataprep.clean_vertex(l))
    revsents = revsents.cache()

github iamprem / summarizer / textrank.py
    graph = vertices.map(lambda ver: dataprep.create_adjlist(ver, allvertices))

github iamprem / summarizer / tryit.py
    graph = wordcloud.map(lambda ver: dataprep.create_adjlist(ver, vert_cache)).filter(lambda l: len(l[1]) > 0).cache()  # remove this filter if not much use
    rank = graph.map(lambda vn: (vn[0], 0.15))  # seed every vertex with rank 0.15 (tuple-parameter lambdas are Python 2 only)
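
The last line seeds every vertex with a rank of 0.15. A hedged sketch of what one PageRank-style update over these RDDs could look like, assuming graph holds (vertex, neighbor_list) pairs and rank holds (vertex, score) pairs; the 0.85 damping factor and the update rule are assumptions, not taken from the repository:

    # One rank update: each vertex splits its score among its neighbors, then
    # scores are re-summed and damped
    contribs = graph.join(rank).flatMap(
        lambda kv: [(nbr, kv[1][1] / len(kv[1][0])) for nbr in kv[1][0]])
    rank = contribs.reduceByKey(lambda a, b: a + b) \
                   .mapValues(lambda s: 0.15 + 0.85 * s)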

github iamprem / summarizer / tryit.py
    vertices = sc.textFile(path).flatMap(lambda review: dataprep.create_vertices(review))
    wordcloud = vertices.map(lambda l: dataprep.clean_vertex(l))

github iamprem / summarizer / textrank.py
    revsents = sc.textFile(path).flatMap(lambda review: dataprep.create_vertices(review))
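
These summarizer snippets all rely on an existing SparkContext named sc and on the repository's local dataprep module. A minimal setup sketch; the application name and input path are placeholders:

# Setup assumed by the snippets above (app name and path are placeholders)
from pyspark import SparkContext
import dataprep  # the repository's own dataprep.py

sc = SparkContext(appName="summarizer")
path = "reviews.txt"
revsents = sc.textFile(path).flatMap(lambda review: dataprep.create_vertices(review))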

github allenai / spv2 / dataprep.py
def documents_from_file(filename):
    # Stream Document objects from a bz2-compressed file of JSON lines,
    # skipping lines that fail to parse
    with bz2.open(filename, 'rt', encoding="UTF-8") as f:
        for line in f:
            try:
                yield Document.from_json(json.loads(line))
            except ValueError as e:
                logging.warning("Error while reading document (%s); skipping", e)
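
A short usage sketch for this generator; the file name is a placeholder, and the doc_id attribute is assumed from the Document constructor shown in the last example below:

# Hypothetical usage: stream documents from a bz2-compressed JSON-lines file
for doc in documents_from_file("papers.jsonl.bz2"):
    print(doc.doc_id)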

github allenai / spv2 / dataprep.py
                    while True:
                        try:
                            yield pickle.load(p.stdout)
                            doc_count += 1
                        except EOFError:
                            break
                    assert doc_count >= 400, "Number of documents (%d) was less than expected (400) from %s. File is likely incomplete" % (
                        doc_count, labeled_and_featurized_tokens_path
                    )
            else:
                logging.warning(
                    "Could not find %s, recreating it", labeled_and_featurized_tokens_path
                )
                nonlocal token_stats
                if token_stats is None:
                    token_stats = TokenStatistics(os.path.join(dirname, "all.tokenstats2.gz"))

                temp_labeled_and_featurized_tokens_path = \
                    labeled_and_featurized_tokens_path + ".%d.temp" % os.getpid()
                with multiprocessing_generator.ParallelGenerator(
                    labeled_tokens(), max_lookahead=64
                ) as docs:
                    docs = docs_with_normalized_features(
                        model_settings.max_page_number,
                        model_settings.token_hash_size,
                        model_settings.font_hash_size,
                        token_stats,
                        docs)
                    with gzip.open(temp_labeled_and_featurized_tokens_path, "wb") as f:
                        for doc in docs:
                            yield doc
                            pickle.dump(doc, f)

github allenai / spv2 / dataprep.py
    @classmethod
    def from_json(cls, json_doc):
        doc_id = json_doc["docId"]
        pages = [Page.from_json(p) for p in json_doc.get("pages", [])]
        return Document(doc_id=doc_id, pages=pages)
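
A small sketch of calling this constructor on a minimal record, using only the 'docId' and 'pages' keys that from_json reads; the doc_id attribute on the result is assumed from the keyword arguments above:

import json

raw = '{"docId": "doc-001", "pages": []}'
doc = Document.from_json(json.loads(raw))
print(doc.doc_id)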