Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_binarize_grayscale(self):
    """
    Binarize the TIFF fixture after converting it to mode 'L'.

    The test passes when exactly 254 bins of the result's histogram are
    empty.
    """
    fixture = os.path.join(resources, 'input.tif')
    with Image.open(fixture) as im:
        gray = im.convert('L')
        res = nlbin(gray)
        # Count the histogram bins that hold no pixels; for a 256-bin
        # histogram, 254 empty bins leave only two populated values.
        empty_bins = res.histogram().count(0)
        self.assertEqual(254, empty_bins, msg='Output not binarized')
def test_binarize_no_bw(self):
    """
    Binarize the JPEG fixture, a format without a 1bpp mode.

    The test passes when exactly 254 bins of the result's histogram are
    empty.
    """
    fixture = os.path.join(resources, 'input.jpg')
    with Image.open(fixture) as im:
        res = nlbin(im)
        # Count the histogram bins that hold no pixels; for a 256-bin
        # histogram, 254 empty bins leave only two populated values.
        empty_bins = res.histogram().count(0)
        self.assertEqual(254, empty_bins, msg='Output not binarized')
def test_not_binarize_bw(self):
    """
    Check that nlbin hands back an image equal to its mode '1' input.
    """
    bw_path = os.path.join(resources, 'bw.png')
    with Image.open(bw_path) as im:
        # The already bi-level fixture must compare equal to nlbin's output.
        result = nlbin(im)
        self.assertEqual(im, result)
def test_binarize_tif(self):
    """
    Binarize the TIFF fixture as opened, without any mode conversion.

    The test passes when exactly 254 bins of the result's histogram are
    empty.
    """
    fixture = os.path.join(resources, 'input.tif')
    with Image.open(fixture) as im:
        res = nlbin(im)
        # Count the histogram bins that hold no pixels; for a 256-bin
        # histogram, 254 empty bins leave only two populated values.
        empty_bins = res.histogram().count(0)
        self.assertEqual(254, empty_bins, msg='Output not binarized')
def test_not_binarize_empty(self):
    """
    Run nlbin on a freshly created, blank 1000x1000 mode '1' image.

    No assertion on the result is made in this block.
    NOTE(review): any expected-exception wrapper (decorator or context
    manager) is not visible here -- confirm against the full test module.
    """
    blank_size = (1000, 1000)
    with Image.new('1', blank_size) as im:
        nlbin(im)
# Resolve the text direction: default to horizontal left-to-right when no
# direction element was found, otherwise read its 'content' attribute.
if td is None:
    td = 'horizontal-lr'
else:
    td = td.attrib['content']

im = None
dest_dict = {'output': output, 'idx': 0, 'src': fp.name, 'uuid': str(uuid.uuid4())}
for section in doc.xpath('//section'):
    # The page image is stored inline: the part of the first <img> src
    # attribute after the comma is base64-encoded image data.
    img = section.xpath('.//img')[0].get('src')
    fd = BytesIO(base64.b64decode(img.split(',')[1]))
    im = Image.open(fd)
    if not im:
        logger.info('Skipping {} because image not found'.format(fp.name))
        break
    if binarize:
        im = binarization.nlbin(im)
    for line in section.iter('li'):
        # Join the line's text once; it is needed both for the emptiness
        # check and for the ground truth output.
        line_text = u''.join(line.itertext())
        # Only editable lines with non-empty, non-whitespace text are
        # exported.
        if not line.get('contenteditable') or not line_text or line_text.isspace():
            continue
        dest_dict['idx'] = idx
        dest_dict['uuid'] = str(uuid.uuid4())
        logger.debug('Writing line {:06d}'.format(idx))
        # 'data-bbox' holds the comma-separated crop box of the line.
        l_img = im.crop([int(x) for x in line.get('data-bbox').split(',')])
        if rotate and td.startswith('vertical'):
            # Image.rotate() returns a new image instead of rotating in
            # place, so the result has to be kept; it is the cropped line
            # (not the whole page) that gets saved below.
            l_img = l_img.rotate(90, expand=True)
        l_img.save(('{output}/' + format + '.png').format(**dest_dict))
        manifest.append((format + '.png').format(**dest_dict))
        text = line_text.strip()
        for func in text_transforms:
            text = func(text)
        with open(('{output}/' + format + '.gt.txt').format(**dest_dict), 'wb') as t:
            t.write(text.encode('utf-8'))
        idx += 1
def binarizer(threshold, zoom, escale, border, perc, range, low, high, base_image, input, output) -> None:
    """
    Binarize the image read from ``input`` and save it to ``output``.

    Progress is reported through ``message``: a check mark on success, a
    cross before any error from binarization or saving is re-raised.

    Args:
        threshold, zoom, escale, border, perc, range, low, high: Forwarded
            positionally, in this order, to ``binarization.nlbin``.
        base_image: Not used by this function.
        input: Source image, passed to ``Image.open``.
        output: Destination path. A JPEG extension (any letter case) or a
            missing extension forces the PNG format.

    Raises:
        click.BadParameter: If the input image cannot be opened.
    """
    from kraken import binarization

    try:
        im = Image.open(input)
    except IOError as e:
        # Keep the original I/O error attached as the cause.
        raise click.BadParameter(str(e)) from e
    message('Binarizing\t', nl=False)
    try:
        res = binarization.nlbin(im, threshold, zoom, escale, border, perc, range,
                                 low, high)
        form = None
        ext = os.path.splitext(output)[1]
        # Compare case-insensitively so mixed-case spellings such as '.Jpg'
        # are also redirected to PNG.
        if ext.lower() in ('.jpg', '.jpeg', ''):
            form = 'png'
            # Only warn when the user actually asked for a JPEG; a missing
            # extension silently defaults to PNG.
            if ext:
                logger.warning('jpeg does not support 1bpp images. Forcing to png.')
        res.save(output, format=form)
    except Exception:
        message('\u2717', fg='red')
        raise
    message('\u2713', fg='green')
if prefill:
logger.info('Loading model {}'.format(prefill))
message('Loading RNN', nl=False)
prefill = models.load_any(prefill)
message('\u2713', fg='green')
with log.progressbar(images, label='Reading images') as bar:
for fp in bar:
logger.info('Reading {}'.format(fp.name))
im = Image.open(fp)
if im.mode not in ['1', 'L', 'P', 'RGB']:
logger.warning('Input {} is in {} color mode. Converting to RGB'.format(fp.name, im.mode))
im = im.convert('RGB')
logger.info('Binarizing page')
im_bin = binarization.nlbin(im)
im_bin = im_bin.convert('1')
logger.info('Segmenting page')
if not lines:
res = pageseg.segment(im_bin, text_direction, scale, maxcolseps, black_colseps, pad=pad)
else:
with open_file(lines, 'r') as fp:
try:
fp = cast(IO[Any], fp)
res = json.load(fp)
except ValueError as e:
raise click.UsageError('{} invalid segmentation: {}'.format(lines, str(e)))
if prefill:
it = rpred.rpred(prefill, im_bin, res)
preds = []
logger.info('Recognizing')
for pred in it: