How to use the scattertext.features.FeatsFromSpacyDoc.FeatsFromSpacyDoc class in scattertext

To help you get started, we’ve selected a few scattertext examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github JasonKessler / scattertext / scattertext / CorpusFromParsedDocuments.py View on Github external
# NOTE(review): the default FeatsFromSpacyDoc() below is constructed once at
# definition time and shared across all calls that omit the argument —
# confirm the extractor is stateless.
def __init__(self,
	             df,
	             category_col,
	             parsed_col,
	             feats_from_spacy_doc=FeatsFromSpacyDoc()):

		'''
		Build a corpus from a dataframe of documents that are already spaCy-parsed.

		Parameters
		----------
		df : pd.DataFrame
			contains category_col and parsed_col, where parsed_col consists entirely of spaCy docs
		category_col : str
			name of category column in convention_df
		parsed_col : str
			name of spacy parsed column in convention_df
		feats_from_spacy_doc : FeatsFromSpacyDoc
			feature extractor applied to each parsed document
		'''
		# reset_index() gives documents stable 0..n-1 positional ids.
		self._df = df.reset_index()
		self._category_col = category_col
		self._parsed_col = parsed_col
		# IndexStore maps category names to integer indices.
		self._category_idx_store = IndexStore()
github JasonKessler / scattertext / scattertext / features / FeatsFromSpacyDocAndEmpath.py View on Github external
from collections import Counter
from functools import partial
from sys import version_info

from scattertext.features.FeatsFromSpacyDoc import FeatsFromSpacyDoc


class FeatsFromSpacyDocAndEmpath(FeatsFromSpacyDoc):
	def __init__(self,
	             use_lemmas=False,
	             entity_types_to_censor=set(),
	             tag_types_to_censor=set(),
	             strip_final_period=False,
	             empath_analyze_function=None,
	             **kwargs):
		'''
		Parameters
		----------
		empath_analyze_function: function (default=empath.Empath().analyze)
			Function that produces a dictionary mapping Empath categories to
			scores for a document.
			NOTE(review): the exact value type is not visible in this excerpt —
			confirm against the empath package documentation.

		Other parameters from FeatsFromSpacyDoc.__init__
		'''
		# NOTE(review): the set() defaults above are mutable and shared across
		# calls — safe only if they are never mutated; confirm.
		# Falls back to the default empath analyzer when none is supplied.
		if empath_analyze_function is None:
github JasonKessler / scattertext / scattertext / features / PhraseMachinePhrases.py View on Github external
from collections import Counter

from scattertext.external.phrasemachine import phrasemachine
from scattertext.features.FeatsFromSpacyDoc import FeatsFromSpacyDoc


class PhraseMachinePhrases(FeatsFromSpacyDoc):
	'''
	Returns unigrams and phrase machine phrases
	'''

	def get_feats(self, doc):
		'''
		Count phrasemachine phrases in a parsed document, sentence by sentence.

		Parameters
		----------
		doc, Spacy Doc

		Returns
		-------
		Counter noun chunk -> count
		'''
		ngram_counter = Counter()
		# Accumulate per-sentence phrase counts (body continues past this excerpt).
		for sent in doc.sents:
...u'Did sometimes march? by heaven I charge thee, speak!',
         ...u'Halt! Who goes there?',
         ...u'[Intro]',
         ...u'It is I sire Tone from Brooklyn.',
         ...u'Well, speak up man what is it?',
         ...u'News from the East sire! THE BEST OF BOTH WORLDS HAS RETURNED!']
       >>> categories = ['hamlet'] * 4 + ['jay-z/r. kelly'] * 5
       >>> clean_function = lambda text: '' if text.startswith('[') else text
       >>> term_doc_mat = ST.TermDocMatrixFactory(category_text_iter = zip(categories, documents),clean_function = clean_function).build()
        """
        # Store the (category, text) pairs and the optional text-cleaning hook.
        self._category_text_iter = category_text_iter
        self._clean_function = clean_function
        # spaCy language pipeline used to parse documents.
        self._nlp = nlp
        self._entity_types_to_censor = set()
        # Fall back to the default unigram extractor when none was supplied.
        if feats_from_spacy_doc is None:
            self._feats_from_spacy_doc = FeatsFromSpacyDoc()
        else:
            self._feats_from_spacy_doc = feats_from_spacy_doc
github JasonKessler / scattertext / scattertext / features / UseFullDocAsMetadata.py View on Github external
from collections import Counter

from scattertext.features.FeatsFromSpacyDoc import FeatsFromSpacyDoc


class UseFullDocAsMetadata(FeatsFromSpacyDoc):
    def get_feats(self, doc):
        '''Extract no term features; always returns an empty Counter.'''
        return Counter()

    def get_doc_metadata(self, doc):
        '''
        Use the document's full text as its single metadata feature.

        Parameters
        ----------
        doc, Spacy Docs

        Returns
        -------
        Counter str -> count
        '''
        full_text = str(doc)
        return Counter({full_text: 1})
github JasonKessler / scattertext / scattertext / features / FeatsFromTopicModel.py View on Github external
)
		return text_df

	def get_doc_metadata(self, doc, prefix=''):
		'''
		Score the document against each topic/category and return integer scores.

		Parameters
		----------
		doc : spaCy Doc (converted to str on Python 3)
		prefix : str, prepended to each category name

		Returns
		-------
		Counter category name -> int score
		'''
		# On Python 3, the analyzer expects raw text rather than a Doc object.
		if version_info[0] >= 3:
			doc = str(doc)
		category_scores = self._analyze(doc).to_dict()[0]
		return Counter({prefix + name: int(value)
		                for name, value in category_scores.items()})

	@abstractmethod
	def _get_terms_from_doc(self, doc):
		'''Extract the terms from ``doc``; must be implemented by subclasses.'''
		pass

class FeatsFromTopicModel(FeatsFromSpacyDoc, FeatsFromTopicModelBase):
	def __init__(self,
	             topic_model,
	             use_lemmas=False,
	             entity_types_to_censor=set(),
	             tag_types_to_censor=set(),
	             strip_final_period=False,
	             **kwargs):
		'''
		Parameters
		----------
		topic_model : dict
			{topicmodelname: [term1, term2, ....], ...}

		Other parameters from FeatsFromSpacyDoc.__init__
		'''
		# NOTE(review): the set() defaults above are mutable and shared across
		# calls — safe only if they are never mutated; confirm.
		# Presumably validates the {name: [terms]} structure — see
		# check_topic_model_string_format for the exact contract.
		check_topic_model_string_format(topic_model)
github JasonKessler / scattertext / scattertext / features / PhraseMachinePhrases.py View on Github external
'''
		Parameters
		----------
		doc, Spacy Doc

		Returns
		-------
		Counter noun chunk -> count
		'''
		# Accumulate phrase counts sentence-by-sentence across the document.
		ngram_counter = Counter()
		for sent in doc.sents:
			ngram_counter += _phrase_counts(sent)
		return ngram_counter


class PhraseMachinePhrasesAndUnigrams(FeatsFromSpacyDoc):
	'''
	Returns unigrams and phrase machine phrases
	'''

	def get_feats(self, doc):
		'''
		Count phrasemachine phrases (and, per the class docstring, unigrams)
		in a parsed document.

		Parameters
		----------
		doc, Spacy Doc

		Returns
		-------
		Counter noun chunk -> count
		'''
		# ngram_counter = phrasemachine.get_phrases(str(doc), tagger='spacy')['counts']
		# Phrases are instead counted per-sentence (body continues past this excerpt).
		ngram_counter = Counter()
github JasonKessler / scattertext / scattertext / features / FeatsFromGeneralInquirer.py View on Github external
from collections import Counter
from re import split
from sys import version_info

import pandas as pd

from scattertext.Common import GENERAL_INQUIRER_URL
from scattertext.features.FeatsFromSpacyDoc import FeatsFromSpacyDoc


class FeatsFromGeneralInquirer(FeatsFromSpacyDoc):
    def __init__(self,
                 use_lemmas=False,
                 entity_types_to_censor=set(),
                 tag_types_to_censor=set(),
                 strip_final_period=False,
                 **kwargs):
        '''
        Feature extractor backed by the Harvard General Inquirer lexicon,
        downloaded and parsed at construction time (presumably from
        GENERAL_INQUIRER_URL — confirm in _download_and_parse_general_inquirer).

        Parameters
        ----------
        Parameters are passed through to FeatsFromSpacyDoc.__init__.
        '''
        # NOTE(review): the lexicon is fetched eagerly here, so constructing
        # this class requires network access.
        self._lexicon_df = self._download_and_parse_general_inquirer()
        super(FeatsFromGeneralInquirer, self).__init__(use_lemmas,
github JasonKessler / scattertext / scattertext / TermDocMatrixFactory.py View on Github external
specified entities, instead of labeled by their lower case orthographic
        form or lemma, will be labeled by their entity type.

        Parameters
        ----------
        entity_types : set of entity types outputted by spaCy
          'TIME', 'WORK_OF_ART', 'PERSON', 'MONEY', 'ORG', 'ORDINAL', 'DATE',
          'CARDINAL', 'LAW', 'QUANTITY', 'GPE', 'PERCENT'

        Returns
        ---------
        self
        '''
        # NOTE(review): type(...) == set rejects set subclasses; isinstance
        # would be the idiomatic check.
        assert type(entity_types) == set
        self._entity_types_to_censor = entity_types
        # Rebuild the feature extractor so the new censor set takes effect.
        self._feats_from_spacy_doc = FeatsFromSpacyDoc(
            use_lemmas=self._use_lemmas,
            entity_types_to_censor=self._entity_types_to_censor
        )
        return self
github JasonKessler / scattertext / scattertext / features / FeatsFromSpacyDocOnlyNounChunks.py View on Github external
from collections import Counter

from scattertext.features.FeatsFromSpacyDoc import FeatsFromSpacyDoc


class FeatsFromSpacyDocOnlyNounChunks(FeatsFromSpacyDoc):
	'''
	Feature extractor that emits only spaCy noun chunks.
	'''

	def get_feats(self, doc):
		'''
		Count each lower-cased noun chunk in the document.

		Parameters
		----------
		doc, Spacy Docs

		Returns
		-------
		Counter noun chunk -> count
		'''
		return Counter(str(chunk).lower() for chunk in doc.noun_chunks)