Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def other_punctuation():
    """Match other punctuation.

    Match other punctuation to split on; punctuation that naturally
    inserts a break in speech.

    The characters used are those of ``symbols.ALL_PUNC`` with the
    characters of ``symbols.TONE_MARKS`` and ``symbols.PERIOD_COMMA``
    removed.

    Returns:
        The ``regex`` attribute of a ``RegexBuilder`` constructed from the
        remaining punctuation characters.
    """
    excluded = set(symbols.TONE_MARKS) | set(symbols.PERIOD_COMMA)
    # Set iteration order for strings can differ between interpreter runs;
    # sorting keeps the character order (and so the pattern arguments)
    # reproducible. Every argument is a single character, so the order does
    # not affect which characters are matched.
    punc = ''.join(sorted(set(symbols.ALL_PUNC) - excluded))
    return RegexBuilder(
        pattern_args=punc,
        # Each character is passed through unchanged.
        pattern_func=lambda x: u"{}".format(x)).regex
def tone_marks(text):
    """Add a space after tone-modifying punctuation.

    The `tone_marks` tokenizer case splits after a tone-modifying
    punctuation mark, so make sure whitespace follows each one.

    Args:
        text: The text to pre-process.

    Returns:
        Whatever ``PreProcessorRegex.run`` returns for ``text``
        (presumably the text with the spaces inserted).
    """
    def _after_mark(mark):
        # Zero-width look-behind: the position right after ``mark``.
        return u"(?<={})".format(mark)

    spacer = PreProcessorRegex(
        search_args=symbols.TONE_MARKS,
        search_func=_after_mark,
        repl=' ')
    return spacer.run(text)
def tone_marks():
    """Keep tone-modifying punctuation by matching the following character.

    Relies on the `tone_marks` pre-processor having been run, so that a
    tone-modifying punctuation mark is never left without a space after it.

    Returns:
        The ``regex`` attribute of a ``RegexBuilder`` built from
        ``symbols.TONE_MARKS``.
    """
    def _char_after(mark):
        # Look-behind for the mark, then consume the one character after it.
        return u"(?<={}).".format(mark)

    builder = RegexBuilder(
        pattern_args=symbols.TONE_MARKS,
        pattern_func=_char_after)
    return builder.regex
def tone_marks():
    """Match the character that follows a tone-modifying punctuation mark.

    Matching the following character (rather than the mark itself) keeps
    the tone-modifying punctuation in place. Assumes the `tone_marks`
    pre-processor was run for cases where there might not be any space
    after a tone-modifying punctuation mark.

    Returns:
        The ``regex`` attribute of a ``RegexBuilder`` built from
        ``symbols.TONE_MARKS``.
    """
    marks = symbols.TONE_MARKS
    # Pattern per mark: a look-behind on the mark plus any single character.
    follow_pattern = lambda mark: u"(?<={}).".format(mark)  # noqa: E731
    return RegexBuilder(pattern_args=marks, pattern_func=follow_pattern).regex
def other_punctuation():
    """Match other punctuation.

    Match other punctuation to split on; punctuation that naturally
    inserts a break in speech.

    The characters used are those of ``symbols.ALL_PUNC`` with the
    characters of ``symbols.TONE_MARKS``, ``symbols.PERIOD_COMMA`` and
    ``symbols.COLON`` removed.

    Returns:
        The ``regex`` attribute of a ``RegexBuilder`` constructed from the
        remaining punctuation characters.
    """
    excluded = (
        set(symbols.TONE_MARKS)
        | set(symbols.PERIOD_COMMA)
        | set(symbols.COLON)
    )
    # Set iteration order for strings can differ between interpreter runs;
    # sorting keeps the character order (and so the pattern arguments)
    # reproducible. Every argument is a single character, so the order does
    # not affect which characters are matched.
    punc = ''.join(sorted(set(symbols.ALL_PUNC) - excluded))
    return RegexBuilder(
        pattern_args=punc,
        # Each character is passed through unchanged.
        pattern_func=lambda x: u"{}".format(x)).regex