from commoncode import text


def test_toascii_works_with_empty_unicode_or_bytes():
    assert u'' == text.toascii(b'', translit=False)
    assert u'' == text.toascii(u'', translit=False)
    assert u'' == text.toascii(b'', translit=True)
    assert u'' == text.toascii(u'', translit=True)
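
# For context, toascii() folds arbitrary text down to ASCII; the test above only
# exercises the empty-input edge case. The sketch below is a minimal, hypothetical
# illustration of the same idea using only the standard library; it is NOT the
# commoncode.text implementation.
import unicodedata


def ascii_fold(s):
    # Decompose accented characters, then drop anything outside ASCII.
    decomposed = unicodedata.normalize('NFKD', s)
    return decomposed.encode('ascii', 'ignore').decode('ascii')


assert ascii_fold(u'') == u''
assert ascii_fold(u'café') == u'cafe'
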
line = remove_punctuation(u' ', line)

# normalize spaces around commas
line = line.replace(u' , ', u', ')

# remove ASCII "line decorations"
# such as in --- or === or !!! or *****
line = remove_ascii_decorations(u' ', line)

# in apache'>Copyright replace ">" by "> "
line = line.replace(u'>', u'> ').replace(u'<', u' <')

# normalize to ascii text
if to_ascii:
    line = toascii(line, translit=True)

# normalize to use only LF as line endings so we can split correctly
# and keep line endings
line = unixlinesep(line)

# strip verbatim back slash and comment signs again at both ends of a line
# FIXME: this is done at the start of this function already
line = line.strip(u'\\/*#%;')

# normalize spaces
line = u' '.join(line.split())
return line
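
# The helpers referenced above (remove_punctuation, remove_ascii_decorations,
# unixlinesep) are module-level names defined elsewhere in the source file. The
# following is a self-contained, hypothetical stand-in for the same normalization
# pipeline using plain re; the exact characters stripped are assumptions, not the
# library's behavior.
import re

_punctuation = re.compile(r'[\(\)\[\]\{\}]')   # assumed punctuation set
_decorations = re.compile(r'[-=!*]{3,}')       # runs such as ---, ===, !!!, *****


def normalize_line(line, to_ascii=False):
    line = _punctuation.sub(u' ', line)
    line = line.replace(u' , ', u', ')
    line = _decorations.sub(u' ', line)
    line = line.replace(u'>', u'> ').replace(u'<', u' <')
    if to_ascii:
        line = line.encode('ascii', 'replace').decode('ascii')  # crude stand-in for toascii()
    line = line.replace(u'\r\n', u'\n').replace(u'\r', u'\n')   # stand-in for unixlinesep()
    line = line.strip(u'\\/*#%;')
    return u' '.join(line.split())


print(normalize_line(u'// Copyright (c) 2020 , Example <dev@example.com> ***'))
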
import hashlib  # used for the md5 digest below


def process_shingles(self, shingle, weighted_list):
    """
    Update weighted_list in place from the bits of the shingle's MD5 digest
    and return it: each set bit adds 1 to its position, each unset bit
    subtracts 1.
    """
    # convert other encodings to ascii. See #1690.
    shingle = toascii(shingle)
    hash = hashlib.md5(shingle.encode()).digest()
    result = self.bitarray_from_bytes(hash)
    for idx, bit in enumerate(result):
        if bit:
            weighted_list[idx] += 1
        else:
            weighted_list[idx] -= 1
    return weighted_list
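
# process_shingles() is the accumulation step of a SimHash-style fingerprint:
# every shingle's 128 MD5 bits vote +1 or -1 for their position. The sketch
# below is an illustrative, self-contained version of that scheme; the names
# and the final sign-collapse step are assumptions, not the library's API.
import hashlib


def simhash_sketch(shingles, bits=128):
    weights = [0] * bits
    for shingle in shingles:
        digest = hashlib.md5(shingle.encode('utf-8')).digest()
        for idx in range(bits):
            # extract bit idx of the digest, most significant bit first
            bit = (digest[idx // 8] >> (7 - idx % 8)) & 1
            weights[idx] += 1 if bit else -1
    # collapse the votes: positive positions become 1, the rest 0
    return [1 if w > 0 else 0 for w in weights]


print(simhash_sketch([u'copyright 2020 acme', u'copyright 2021 acme']))
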
def strings_from_file(location, buff_size=1024 * 1024, ascii=False, clean=True, min_len=MIN_LEN):
    """
    Yield unicode strings made only of ASCII characters found in file at location.
    Process the file in chunks (to limit memory usage). If ascii is True, strings
    are converted to plain ASCII "str or byte" strings instead of unicode.
    """
    with open(location, 'rb') as f:
        while True:
            buf = f.read(buff_size)
            if not buf:
                break
            for s in strings_from_string(buf, clean=clean, min_len=min_len):
                if ascii:
                    s = toascii(s)
                s = s.strip()
                if len(s) >= min_len:
                    yield s
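
# Minimal usage sketch. The import path below assumes ScanCode's textcode.strings
# module layout, and the binary path is illustrative only; adjust both as needed.
from textcode.strings import strings_from_file

for s in strings_from_file('/bin/ls', ascii=True, min_len=6):
    print(s)
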
from itertools import islice

import typecode.contenttype
import commoncode.text


def generated_code(location):
    '''
    Yield lines of extracted text from the file at location if that file is
    likely generated source code.

    For each of the first few lines of a source code file, if generated
    keywords are found in the lowercased line, yield the line text as a
    "potentially generated" annotation.
    '''
    # max_lines and generated_keywords are module-level constants; see the
    # illustrative values after this function.
    T = typecode.contenttype.get_type(location)
    if not T.is_text:
        return
    with open(location, 'rb') as filein:
        for line in islice(filein, max_lines):
            text = commoncode.text.toascii(line.strip())
            textl = text.lower()
            if any(kw in textl for kw in generated_keywords):
                # yield only the first 100 chars
                yield text[:100]
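
# generated_code() relies on two module-level constants that are not part of this
# excerpt. The values below are illustrative placeholders; the real constants in
# the source module differ.
max_lines = 150
generated_keywords = frozenset([
    u'generated by',
    u'auto-generated',
    u'automatically generated',
])

# Hypothetical usage on an illustrative path:
for annotation in generated_code('build/parser.py'):
    print('potentially generated:', annotation)
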
def display_extract_summary():
    """
    Display a summary of warnings and errors if any.
    """
    has_warnings = False
    has_errors = False
    summary = []
    for xev in extract_results:
        has_errors = has_errors or bool(xev.errors)
        has_warnings = has_warnings or bool(xev.warnings)
        source = fileutils.as_posixpath(xev.source)
        if not isinstance(source, compat.unicode):
            source = toascii(source, translit=True).decode('utf-8', 'replace')
        source = get_relative_path(path=source, len_base_path=len_base_path, base_is_dir=base_is_dir)
        for e in xev.errors:
            echo_stderr('ERROR extracting: %(source)s: %(e)s' % locals(), fg='red')
        for warn in xev.warnings:
            echo_stderr('WARNING extracting: %(source)s: %(warn)s' % locals(), fg='yellow')
    summary_color = 'green'
    if has_warnings:
        summary_color = 'yellow'
    if has_errors:
        summary_color = 'red'
    echo_stderr('Extracting done.', fg=summary_color, reset=True)
def extract_event(item):
    """
    Display an extract event.
    """
    if quiet:
        return ''
    if not item:
        return ''
    source = item.source
    if not isinstance(source, compat.unicode):
        source = toascii(source, translit=True).decode('utf-8', 'replace')
    if verbose:
        if item.done:
            return ''
        line = source and get_relative_path(path=source, len_base_path=len_base_path, base_is_dir=base_is_dir) or ''
    else:
        line = source and fileutils.file_name(source) or ''
    if not isinstance(line, compat.unicode):
        line = toascii(line, translit=True).decode('utf-8', 'replace')
    return 'Extracting: %(line)s' % locals()
def find(location, patterns):
    """
    Yield tuples of (key, found text, text line, line number) for matches of
    the regexes in patterns found in the text at location. patterns is a list
    of (key, compiled regex) tuples.

    Note: the location can be a list of lines for testing convenience.
    """
    if TRACE:
        from pprint import pformat
        loc = pformat(location)
        logger_debug('find(location=%(loc)r,\n patterns=%(patterns)r)' % locals())

    for lineno, line in analysis.numbered_text_lines(location):
        for key, pattern in patterns:
            for match in pattern.findall(line):
                if TRACE:
                    logger_debug('find: yielding match: key=%(key)r, '
                                 'match=%(match)r,\n line=%(line)r' % locals())
                yield key, toascii(match), line, lineno
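
# Usage sketch for the generator above. It assumes the enclosing module supplies
# the module-level names used in the body (analysis, TRACE, logger_debug, toascii),
# as in ScanCode's cluecode/finder.py; the email regex and sample lines below are
# illustrative only, and the patterns format is inferred from the loop.
import re

patterns = [
    ('email', re.compile(r'[\w.+-]+@[\w-]+\.[\w.-]+')),
]

# Per the docstring, location may also be a list of text lines.
lines = [u'Contact: dev@example.com', u'no matches on this line']
for key, match, line, lineno in find(lines, patterns):
    print(lineno, key, match)
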