How to use the textblob.compat.unicode function in textblob

To help you get started, we’ve selected a few textblob examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github sloria / TextBlob / textblob / _text.py View on Github external
s[i][j][0] = s[i][j][0].replace("/", "&slash;")
                s[i][j] = "/".join(s[i][j])
            s[i] = " ".join(s[i])
        s = "\n".join(s)
        s = TaggedString(unicode(s), format, language=kwargs.get("language", self.language))
        return s


#--- TAGGED STRING ---------------------------------------------------------------------------------
# Pattern.parse() returns a TaggedString: a Unicode string with "tags" and "language" attributes.
# The pattern.text.tree.Text class uses the "tags" attribute to determine the token format and
# to transform the tagged string into a parse tree of nested Sentence, Chunk and Word objects.

# Marker constant; presumably the argument given to TaggedString.split() to get
# the nested token list back (usage is not visible here -- TODO confirm).
TOKENS = "tokens"

class TaggedString(unicode):

    def __new__(self, string, tags=["word"], language=None):
        """ Unicode string that carries "tags" and "language" attributes.
            For example: TaggedString("cat/NN/NP", tags=["word", "pos", "chunk"]).
            The given string can be:
            - plain text, which is used as-is,
            - a string that already has a "tags" attribute, in which case its tags
              and language replace the tags and language arguments,
            - a nested list (sentences > tokens > tag strings), which is flattened:
              sentences are joined by newlines, tokens by spaces, tags by slashes,
              and slashes inside a tag string are encoded as &slash;
        """
        # The list default is safe here: it is never mutated,
        # each instance stores its own copy (see list(tags) below).
        already_tagged = isinstance(string, unicode) and hasattr(string, "tags")
        if already_tagged:
            tags = string.tags
            language = string.language
        if isinstance(string, list):
            lines = []
            for sentence in string:
                words = []
                for token in sentence:
                    escaped = [part.replace("/", "&slash;") for part in token]
                    words.append("/".join(escaped))
                lines.append(" ".join(words))
            string = "\n".join(lines)
        tagged = unicode.__new__(self, string)
        tagged.tags = list(tags)
        tagged.language = language
        return tagged
github sloria / TextBlob / textblob / blob.py View on Github external
"""Returns an list of tuples of the form (word, POS tag).

        Example:
        ::

            [('At', 'IN'), ('eight', 'CD'), ("o'clock", 'JJ'), ('on', 'IN'),
                    ('Thursday', 'NNP'), ('morning', 'NN')]

        :rtype: list of tuples
        """
        if isinstance(self, TextBlob):
            return [val for sublist in [s.pos_tags for s in self.sentences] for val in sublist]
        else:
            return [(Word(word, pos_tag=t), unicode(t))
                    for word, t in self.pos_tagger.tag(self)
                    if not PUNCTUATION_REGEX.match(unicode(t))]
github sloria / TextBlob / textblob / blob.py View on Github external
def pos_tags(self):
        """Returns a list of tuples of the form (word, POS tag).

        For a TextBlob, the result is the concatenation of the ``pos_tags`` of
        each of its sentences. For anything else, ``self.pos_tagger.tag(self)``
        is used and pairs whose tag matches ``PUNCTUATION_REGEX`` are left out.

        Example:
        ::

            [('At', 'IN'), ('eight', 'CD'), ("o'clock", 'JJ'), ('on', 'IN'),
                    ('Thursday', 'NNP'), ('morning', 'NN')]

        :rtype: list of tuples
        """
        if not isinstance(self, TextBlob):
            tagged = []
            for word, t in self.pos_tagger.tag(self):
                # Skip tokens that were tagged as punctuation.
                if PUNCTUATION_REGEX.match(unicode(t)):
                    continue
                tagged.append((Word(word, pos_tag=t), unicode(t)))
            return tagged
        # Collect every sentence's tags first, then flatten them in order.
        per_sentence = [sentence.pos_tags for sentence in self.sentences]
        flattened = []
        for sentence_tags in per_sentence:
            flattened.extend(sentence_tags)
        return flattened
github sloria / TextBlob / textblob / _text.py View on Github external
def find_tokens(string, punctuation=PUNCTUATION, abbreviations=ABBREVIATIONS, replace=replacements, linebreak=r"\n{2,}"):
    """ Returns a list of sentences. Each sentence is a space-separated string of tokens (words).
        Handles common cases of abbreviations (e.g., etc., ...).
        Punctuation marks are split from other words. Periods (or ?!) mark the end of a sentence.
        Headings without an ending period are inferred by line breaks.
    """
    # Handle periods separately.
    punctuation = tuple(punctuation.replace(".", ""))
    # Handle replacements (contractions).
    for a, b in list(replace.items()):
        string = re.sub(a, b, string)
    # Handle Unicode quotes.
    if isinstance(string, unicode):
        string = unicode(string).replace("“", " “ ")\
                                .replace("”", " ” ")\
                                .replace("‘", " ‘ ")\
                                .replace("’", " ’ ")\
                                .replace("'", " ' ")\
                                .replace('"', ' " ')
    # Collapse whitespace.
    string = re.sub("\r\n", "\n", string)
    string = re.sub(linebreak, " %s " % EOS, string)
    string = re.sub(r"\s+", " ", string)
    tokens = []
    for t in TOKEN.findall(string+" "):
        if len(t) > 0:
            tail = []
            while t.startswith(punctuation) and \
              not t in replace:
                # Split leading punctuation.
github sloria / TextBlob / textblob / _text.py View on Github external
if chunks:
            format.extend(("chunk", "preposition"))
        if relations:
            format.append("relation")
        if lemmata:
            format.append("lemma")
        # Collapse raw list.
        # Sentences are separated by newlines, tokens by spaces, tags by slashes.
        # Slashes in words are encoded with &slash;
        for i in range(len(s)):
            for j in range(len(s[i])):
                s[i][j][0] = s[i][j][0].replace("/", "&slash;")
                s[i][j] = "/".join(s[i][j])
            s[i] = " ".join(s[i])
        s = "\n".join(s)
        s = TaggedString(unicode(s), format, language=kwargs.get("language", self.language))
        return s
github sloria / TextBlob / text__init__.py View on Github external
def parse(s, *args, **kwargs):
    """ Returns a tagged Unicode string.
        The input is converted with unicode() and handed to parser.parse(),
        together with any extra positional and keyword arguments.
    """
    text = unicode(s)
    tagged = parser.parse(text, *args, **kwargs)
    return tagged
github sloria / TextBlob / textblob / _text.py View on Github external
def decode_string(v, encoding="utf-8"):
    """ Returns the given value as a Unicode string (if possible).
        If encoding is a single encoding name, it is tried first, followed by the
        fallbacks windows-1252 and utf-8 (ignoring undecodable bytes).
        Otherwise, encoding is iterated as a sequence of argument tuples for decode(),
        for example: (("utf-8",), ("latin-1", "replace")).
        A byte string that none of the candidates can decode is returned unchanged.
        Values that are not byte strings are passed to unicode().
    """
    if isinstance(encoding, basestring):
        encoding = ((encoding,),) + (("windows-1252",), ("utf-8", "ignore"))
    if not isinstance(v, binary_type):
        return unicode(v)
    for e in encoding:
        try:
            return v.decode(*e)
        except (ValueError, LookupError, TypeError):
            # Best-effort: undecodable bytes (UnicodeDecodeError is a ValueError),
            # an unknown codec or error handler (LookupError) or a malformed
            # argument tuple (TypeError) just moves on to the next candidate.
            # KeyboardInterrupt and SystemExit are deliberately not swallowed.
            continue
    return v
github sloria / TextBlob / text__init__.py View on Github external
def polarity(s, **kwargs):
    """ Returns the sentence polarity (positive/negative) between -1.0 and 1.0.
        This is the first item of the result of sentiment(),
        called with the input converted by unicode() and the given keyword arguments.
    """
    scores = sentiment(unicode(s), **kwargs)
    return scores[0]
github sloria / TextBlob / text__init__.py View on Github external
def parsetree(s, *args, **kwargs):
    """ Returns a parsed Text from the given string.
        The input is converted with unicode(), tagged with parse()
        (extra arguments are passed along) and the result is wrapped in Text.
    """
    tagged = parse(unicode(s), *args, **kwargs)
    return Text(tagged)