How to use the nltk.probability.FreqDist function in nltk

To help you get started, we’ve selected a few nltk.probability.FreqDist examples, based on popular ways it is used in public projects.

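Before the project excerpts, here is a minimal, self-contained sketch of the core FreqDist API (counting, lookup, and relative frequency); the sample sentence is made up for illustration:

from nltk.probability import FreqDist

# Count how often each token occurs in any iterable of hashable items.
words = "the cat sat on the mat near the door".split()
fd = FreqDist(words)

print(fd["the"])          # 3 -- FreqDist is a collections.Counter subclass
print(fd.N())             # 9 -- total number of samples counted
print(fd.B())             # 7 -- number of distinct samples (bins)
print(fd.max())           # 'the' -- the most frequent sample
print(fd.most_common(2))  # [('the', 3), ...] -- top samples with counts
print(fd.freq("the"))     # 0.333... -- relative frequency, count / N()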

github nltk / nltk / nltk / probability.py
def __le__(self, other):
        if not isinstance(other, FreqDist):
            raise_unorderable_types("<=", self, other)
        return set(self).issubset(other) and all(
            self[key] <= other[key] for key in self
        )
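This __le__ implementation makes <= a subset-style comparison: it holds only when every sample in the left distribution also appears in the right one with at least the same count. A quick illustration with made-up data:

from nltk.probability import FreqDist

small = FreqDist("aab")    # counts: a=2, b=1
big = FreqDist("aaabbc")   # counts: a=3, b=2, c=1

print(small <= big)  # True: every count in small is matched or exceeded
print(big <= small)  # False: big has a sample ('c') and counts small lacks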
github SPOClab-ca / COVFEFE / utils / lexicosyntactic / syntactic_features.py
            try:
                parsetree_features["average_VP_length"] = 1.0*lenVP/totVP
            except ZeroDivisionError:  # no VPs observed, so totVP == 0
                parsetree_features["average_VP_length"] = 0
            try:
                parsetree_features["average_NP_length"] = 1.0*lenNP/totNP
            except ZeroDivisionError:  # no NPs observed, so totNP == 0
                parsetree_features["average_NP_length"] = 0

            parsetree_keys += ['PP_type_prop', 'VP_type_prop', 'NP_type_prop',
                               'PP_type_rate', 'VP_type_rate', 'NP_type_rate',
                               'average_PP_length', 'average_VP_length', 'average_NP_length']

            # Normalize by number of productions
            num_productions = len(prod_nonlexical)
            fdist = nltk.probability.FreqDist(prod_nonlexical)

            for prod_rule in top_rules: # need this to ensure we always get same number of CFG features
                if prod_rule in fdist:
                    parsetree_features[prod_rule] = 1.0 * fdist[prod_rule] / num_productions
                else:
                    parsetree_features[prod_rule] = 0.0
                parsetree_keys += [prod_rule]

    return parsetree_keys, parsetree_features
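The loop above turns raw production-rule counts into a fixed-length feature vector: each rule in top_rules gets its relative frequency, or 0.0 if it never fired. Below is a minimal sketch of that normalization, where prod_nonlexical and top_rules are hypothetical stand-ins for the parse-derived values:

from nltk.probability import FreqDist

# Hypothetical parse output: the nonlexical production rules observed.
prod_nonlexical = ["S -> NP VP", "NP -> DT NN", "VP -> VBD NP", "NP -> DT NN"]
# Hypothetical fixed rule vocabulary, so every document yields the same
# number of CFG features.
top_rules = ["NP -> DT NN", "VP -> VBD PP"]

num_productions = len(prod_nonlexical)
fdist = FreqDist(prod_nonlexical)

# fdist[rule] is 0 for unseen rules, so no membership test is needed here.
features = {rule: 1.0 * fdist[rule] / num_productions for rule in top_rules}
print(features)  # {'NP -> DT NN': 0.5, 'VP -> VBD PP': 0.0}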
github mdenil / txtnets / code / train_scripts / train_evaluate_naive_bayes.py
def load_data(file_name):
    data_dir = os.path.join("../data", "stanfordmovie")

    with open(os.path.join(data_dir, file_name)) as data_file:
        raw_data = json.load(data_file)
        train_x, train_y = map(list, zip(*raw_data))
        # train_x, train_y = map(list, zip(*raw_data[:100]))

        data = []
        for sentences, label in zip(train_x, train_y):
            words = [w for s in sentences for w in s]
            data.append((FreqDist(words), label))

    return data
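load_data pairs each document's bag of words (a FreqDist) with its label; since FreqDist is a dict subclass, this is a common featureset shape for NLTK-style classifiers. The same pattern with a small in-memory stand-in for the JSON file:

from nltk.probability import FreqDist

# Hypothetical stand-in for the (list-of-sentences, label) records on disk.
raw_data = [
    ([["great", "movie"], ["loved", "it"]], "pos"),
    ([["terrible", "plot"]], "neg"),
]

data = []
for sentences, label in raw_data:
    words = [w for s in sentences for w in s]  # flatten sentences to words
    data.append((FreqDist(words), label))

print(data[0][0].most_common())  # [('great', 1), ('movie', 1), ...]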
github sloria / TextBlob / nltk / probability.py
def gt_demo():
    from nltk import corpus
    emma_words = corpus.gutenberg.words('austen-emma.txt')
    fd = FreqDist(emma_words)
    gt = GoodTuringProbDist(fd)
    sgt = SimpleGoodTuringProbDist(fd)
    katz = SimpleGoodTuringProbDist(fd, 7)
    print('%18s %8s  %12s %14s  %12s' \
        % ("word", "frequency", "GoodTuring", "SimpleGoodTuring", "Katz-cutoff"))
    for key in fd:
        print('%18s %8d  %12e   %14e   %12e' \
            % (key, fd[key], gt.prob(key), sgt.prob(key), katz.prob(key)))
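gt_demo contrasts raw FreqDist counts with smoothed probability estimates built on top of them. The sketch below shows the same FreqDist-to-ProbDist pattern with Lidstone (additive) smoothing instead, chosen here only because it runs cleanly on a tiny sample, whereas the Good-Turing estimators need more data for a stable fit:

from nltk.probability import FreqDist, MLEProbDist, LidstoneProbDist

fd = FreqDist("abracadabra")  # character counts: a=5, b=2, r=2, c=1, d=1

mle = MLEProbDist(fd)                        # unsmoothed maximum likelihood
lid = LidstoneProbDist(fd, 0.5, fd.B() + 1)  # add-0.5, one extra unseen bin

print(mle.prob("a"), mle.prob("z"))  # 0.4545..., 0.0 -- unseen gets nothing
print(lid.prob("a"), lid.prob("z"))  # 0.3928..., 0.0357... -- mass reserved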
github rplevy / clojure-nltk / resources / nltk / classify / decisiontree.py
    @staticmethod
    def leaf(labeled_featuresets):
        label = FreqDist([label for (featureset,label)
                          in labeled_featuresets]).max()
        return DecisionTreeClassifier(label)
github nltk / nltk / nltk / classify / decisiontree.py
def leaf(labeled_featuresets):
        label = FreqDist(label for (featureset, label) in labeled_featuresets).max()
        return DecisionTreeClassifier(label)
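All three leaf variants in this list (clojure-nltk above, nltk here, and the TextBlob copy further below) use the same idiom: build a FreqDist over the labels and take .max() to get the majority class for a decision-tree leaf. Standalone, with made-up data:

from nltk.probability import FreqDist

# Hypothetical labeled featuresets: (featureset, label) pairs.
labeled_featuresets = [({"f": 1}, "spam"), ({"f": 0}, "ham"), ({"f": 1}, "spam")]

majority = FreqDist(label for (featureset, label) in labeled_featuresets).max()
print(majority)  # 'spam' -- the most frequent label wins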
github sloria / TextBlob / nltk / app / collocations_app.py
def run(self):
            try:
                words = self.model.CORPORA[self.name]()
                from operator import itemgetter
                text = list(filter(lambda w: len(w) > 2, words))  # list() so it can be sliced under Python 3
                fd = FreqDist(tuple(text[i:i+2]) for i in range(len(text)-1))
                vocab = FreqDist(text)
                scored = [((w1,w2), fd[(w1,w2)] ** 3 / float(vocab[w1] * vocab[w2])) for w1, w2 in fd]
                scored.sort(key=itemgetter(1), reverse=True)
                self.model.collocations = list(map(itemgetter(0), scored))
                self.model.notify_listeners(CORPUS_LOADED_EVENT)
            except Exception as e:
                print(e)
                self.model.notify_listeners(ERROR_LOADING_CORPUS_EVENT)
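Here FreqDist does double duty: one distribution counts bigrams, another counts unigrams, and each bigram is scored by its cubed joint count over the product of its parts' counts. The same scoring on a tiny made-up corpus:

from nltk.probability import FreqDist

words = "new york is a big city and new york never sleeps".split()
text = [w for w in words if len(w) > 2]  # drop very short tokens

fd = FreqDist(tuple(text[i:i + 2]) for i in range(len(text) - 1))  # bigrams
vocab = FreqDist(text)                                             # unigrams

scored = [((w1, w2), fd[(w1, w2)] ** 3 / float(vocab[w1] * vocab[w2]))
          for w1, w2 in fd]
scored.sort(key=lambda item: item[1], reverse=True)
print(scored[0])  # (('new', 'york'), 2.0) -- the strongest collocation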
github sloria / TextBlob / nltk / classify / decisiontree.py
def leaf(labeled_featuresets):
        label = FreqDist(label for (featureset,label)
                         in labeled_featuresets).max()
        return DecisionTreeClassifier(label)
github rplevy / clojure-nltk / resources / nltk / classify / naivebayes.py
    @staticmethod
    def train(labeled_featuresets, estimator=ELEProbDist):
        """
        :param labeled_featuresets: A list of classified featuresets,
            i.e., a list of tuples ``(featureset, label)``.
        """
        label_freqdist = FreqDist()
        feature_freqdist = defaultdict(FreqDist)
        feature_values = defaultdict(set)
        fnames = set()

        # Count up how many times each feature value occurred, given
        # the label and featurename.
        for featureset, label in labeled_featuresets:
            label_freqdist.inc(label)
            for fname, fval in featureset.items():
                # Increment freq(fval|label, fname)
                feature_freqdist[label, fname].inc(fval)
                # Record that fname can take the value fval.
                feature_values[fname].add(fval)
                # Keep a list of all feature names.
                fnames.add(fname)

        # If a feature didn't have a value given for an instance, then
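The excerpt cuts off mid-comment, and its .inc() calls come from the pre-3.0 NLTK API that clojure-nltk bundles; in NLTK 3, FreqDist is a collections.Counter subclass, so the same counting is plain item assignment. A sketch of the equivalent modern pattern, with hypothetical featuresets:

from collections import defaultdict
from nltk.probability import FreqDist

# Hypothetical labeled featuresets, as described in the docstring above.
labeled_featuresets = [({"contains(good)": True}, "pos"),
                       ({"contains(good)": False}, "neg")]

label_freqdist = FreqDist()
feature_freqdist = defaultdict(FreqDist)

for featureset, label in labeled_featuresets:
    label_freqdist[label] += 1                     # was label_freqdist.inc(label)
    for fname, fval in featureset.items():
        feature_freqdist[label, fname][fval] += 1  # was .inc(fval)

print(label_freqdist.most_common())  # [('pos', 1), ('neg', 1)]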