Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_conversion_from_bytes_14_last_page_12(self):
    """Convert ``./tests/test_14.pdf`` from bytes, stopping at page 12.

    Reads the fixture into memory, calls ``convert_from_bytes`` with
    ``last_page=12`` and checks that exactly 12 images are returned, then
    prints the elapsed wall-clock time divided by 14.
    """
    start_time = time.time()
    with open("./tests/test_14.pdf", "rb") as pdf_file:
        images_from_bytes = convert_from_bytes(pdf_file.read(), last_page=12)
        # assertEqual shows the actual count on failure; assertTrue(a == b)
        # would only report "False is not true".
        self.assertEqual(len(images_from_bytes), 12)
    # NOTE(review): the divisor is 14 although only 12 pages are requested;
    # kept as-is so reported timings stay comparable -- confirm intent.
    print(
        "test_conversion_from_bytes_14_last_page_12: {} sec".format(
            (time.time() - start_time) / 14.0
        )
    )
def test_not_locked_pdf(self):
    """Convert ``./tests/test.pdf`` from bytes while passing a user password.

    The PDF bytes are converted into a temporary output folder with
    ``fmt=".jpg"`` and ``userpw="pdf2image"``; the test checks that exactly
    one image is produced. Judging by the test name, the fixture itself is
    presumably not password-protected -- verify against the fixture.
    """
    start_time = time.time()
    with TemporaryDirectory() as path:
        with open("./tests/test.pdf", "rb") as pdf_file:
            images_from_bytes = convert_from_bytes(
                pdf_file.read(), output_folder=path, fmt=".jpg", userpw="pdf2image"
            )
            # assertEqual shows the actual count on failure.
            self.assertEqual(len(images_from_bytes), 1)
            # Close every image while the temporary folder still exists.
            for im in images_from_bytes:
                im.close()
    # The label previously named a different test
    # ("test_locked_pdf_with_userpw_only"), which made timing output
    # misleading; it now matches this method.
    print("test_not_locked_pdf: {} sec".format(time.time() - start_time))
def test_conversion_from_bytes_14(self):
    """Convert ``./tests/test_14.pdf`` from bytes and expect 14 images.

    Reads the fixture into memory, passes the bytes to
    ``convert_from_bytes`` with default options, checks the number of
    returned images, and prints the elapsed time divided by 14.
    """
    start_time = time.time()
    with open("./tests/test_14.pdf", "rb") as pdf_file:
        images_from_bytes = convert_from_bytes(pdf_file.read())
        # assertEqual shows the actual count on failure.
        self.assertEqual(len(images_from_bytes), 14)
    print(
        "test_conversion_from_bytes_14: {} sec".format(
            (time.time() - start_time) / 14.0
        )
    )
def test_conversion_from_bytes_using_dir_241(self):  # pragma: no cover
    """Convert ``./tests/test_241.pdf`` from bytes via an output folder.

    The PDF bytes are converted with ``output_folder`` set to a temporary
    directory; the test checks that 241 images are returned, closes them,
    and prints the elapsed time divided by 241.
    """
    start_time = time.time()
    with TemporaryDirectory() as path:
        with open("./tests/test_241.pdf", "rb") as pdf_file:
            images_from_bytes = convert_from_bytes(
                pdf_file.read(), output_folder=path
            )
            # assertEqual shows the actual count on failure.
            self.assertEqual(len(images_from_bytes), 241)
            # Close every image while the temporary folder still exists.
            for im in images_from_bytes:
                im.close()
    print(
        "test_conversion_from_bytes_using_dir_241: {} sec".format(
            (time.time() - start_time) / 241.0
        )
    )
def test_conversion_from_bytes_with_quality_and_progressive_and_optimize(self):
    """Convert ``./tests/test.pdf`` from bytes to JPEG with all jpegopt keys.

    Calls ``convert_from_bytes`` with ``fmt="jpg"`` and a ``jpegopt`` dict
    setting ``quality``, ``progressive`` and ``optimize`` together, checks
    that one image is returned, and prints the elapsed time.
    """
    start_time = time.time()
    with open("./tests/test.pdf", "rb") as pdf_file:
        images_from_bytes = convert_from_bytes(
            pdf_file.read(),
            fmt="jpg",
            jpegopt={"quality": 100, "progressive": True, "optimize": True},
        )
        # assertEqual shows the actual count on failure.
        self.assertEqual(len(images_from_bytes), 1)
    print(
        "test_conversion_from_bytes_with_quality_and_progressive_and_optimize: {} sec".format(
            time.time() - start_time
        )
    )
def test_conversion_to_grayscale_from_bytes(self):
    """Convert ``./tests/test_14.pdf`` from bytes with ``grayscale=True``.

    Checks that the first returned image has mode ``"L"`` and prints the
    elapsed time divided by 14.
    """
    start_time = time.time()
    with open("./tests/test_14.pdf", "rb") as pdf_file:
        images_from_bytes = convert_from_bytes(pdf_file.read(), grayscale=True)
        # assertEqual shows the actual mode on failure.
        self.assertEqual(images_from_bytes[0].mode, "L")
    print(
        "test_conversion_to_grayscale_from_bytes_14: {} sec".format(
            (time.time() - start_time) / 14.0
        )
    )
tmp = obj_url.split('/')[-1].split('?')
if len(tmp) <= 1:
filename = ''.join(tmp)
else:
filename = ''.join(tmp[:-1])
content_type = key.split('-')[0]
ext = filename.split('.')[-1]
if filename is not None and buf is not None:
if content_type == 'image':
try:
buf.seek(0)
_ = Image.open(buf)
return [WrappaImage(
payload=buf.getvalue(), ext=ext, name=filename)]
except:
imgs = convert_from_bytes(buf.getvalue())
for i, img in enumerate(imgs):
buf = io.BytesIO()
img.save(buf, format='JPEG')
buf.flush()
imgs[i] = WrappaImage(
payload=buf.getvalue(),
ext='jpeg',
name='{}-{}.jpeg'.format(
filename.split('.')[0], str(i)
)
)
return imgs
data = {
'payload': buf.getvalue(),
'ext': ext,
'name': filename
\end{preview}
\end{document}
""".replace('[]', latex)
pdf = build_pdf(latex)
if format == 'file':
filename = filename or '/tmp/{}.pdf'.format(''.join(np.random.choice(list('abcdef'), 10, replace=True)))
if filename.endswith('.pdf'):
with open(filename, 'wb') as f:
f.write(pdf.data)
elif filename.endswith('.png'):
from pdf2image import convert_from_bytes
image = convert_from_bytes(pdf.data)
imageio.imwrite(filename, image)
return filename
elif format == 'bytes':
return pdf
elif format == 'array':
from pdf2image import convert_from_bytes
return np.array(convert_from_bytes(pdf.data)[0])
elif format == 'fig':
from pdf2image import convert_from_bytes
from matplotlib.figure import Figure
dpi, scale = 300, 0.75
image = np.array(convert_from_bytes(pdf.data, dpi=dpi)[0])
def convert_pdf_to_jpeg(pdf: typing.IO[bytes], preview_size: ImgDims) -> BytesIO:
    """Render a PDF stream to JPEG data held in an in-memory buffer.

    The whole stream is read and handed to ``convert_from_bytes``. Each
    resulting page image is resized to the dimensions returned by
    ``compute_resize_dims`` for ``preview_size`` and saved as JPEG into one
    shared buffer, one page after another.

    Args:
        pdf: Binary stream containing the PDF document.
        preview_size: Dimensions passed to ``compute_resize_dims`` as the
            target size.

    Returns:
        The buffer holding the written JPEG data, rewound to offset 0.
    """
    pages = convert_from_bytes(pdf.read())
    jpeg_buffer = BytesIO()
    for page in pages:
        source_dims = ImgDims(page.width, page.height)
        target_dims = compute_resize_dims(source_dims, preview_size)
        target_size = (target_dims.width, target_dims.height)
        # NOTE(review): resample=True is passed through unchanged; an
        # explicit resampling filter constant would be clearer -- confirm.
        page.resize(target_size, resample=True).save(jpeg_buffer, format="JPEG")
    # Rewind so callers can read the buffer from the beginning.
    jpeg_buffer.seek(0, 0)
    return jpeg_buffer
def process_hocr_pdf(savepath, bind=True):
    """OCR each page of a PDF and merge the results into a single PDF.

    Every page of ``savepath`` is rendered at 300 dpi, saved as a PNG inside
    a temporary directory, and passed to
    ``pytesseract.image_to_pdf_or_hocr`` to obtain a one-page PDF. The
    per-page PDFs are then concatenated by the external ``pdftk`` command
    into ``./tmp/<input name>_hocr.pdf``.

    Args:
        savepath: Path of the PDF file to process.
        bind: Not used by this function; kept so existing callers that pass
            it keep working.

    Returns:
        A dict with the single key ``'output'`` holding the output file
        name (without the ``./tmp`` directory).
    """

    def convert_png(png):
        """Run OCR on the image at path ``png``; return the PDF path written."""
        ocr_output = pytesseract.image_to_pdf_or_hocr(png, extension='pdf')
        pdf_path = png + '.pdf'
        with open(pdf_path, 'wb') as f:
            f.write(ocr_output)
        return pdf_path

    with open(savepath, 'rb') as inputfile:
        # Only the stem is needed; the extension was previously bound to an
        # unused local.
        filename = os.path.splitext(os.path.basename(savepath))[0]
        logger.info('processing file %s', savepath)
        outputfilename = filename + '_hocr.pdf'
        output_path = os.path.join('./tmp', outputfilename)
        with tempfile.TemporaryDirectory() as path:
            pngs = convert_from_bytes(inputfile.read(), dpi=300, output_folder=path)
            pdfs = []
            for i, png in enumerate(pngs):
                png_path = os.path.join(path, filename + '_' + str(i + 1) + '.png')
                png.save(png_path)
                pdfs.append(convert_png(png_path))
            # pdftk must run while the temporary directory still exists,
            # because the per-page PDFs live inside it.
            status = subprocess.call(['pdftk', *pdfs, 'cat', 'output', output_path])
    if status != 0:
        # Previously the exit status was ignored and success was logged even
        # when the merged file had not been produced.
        logger.error('pdftk exited with status %s while writing %s', status, output_path)
    else:
        logger.info('processed file %s', outputfilename)
    return {'output': outputfilename}