Source code for matviz.doctools
import fitz # PyMuPDF
import io
import os
from PIL import Image
import shutil
[docs]
def extract_images_from_pdf(input_pdf_path: str, output_folder: str) -> str:
    """
    Extract every embedded image from a PDF and zip the results.

    Each image referenced by a page is written to ``output_folder`` as
    ``page<P>_img<I>.<ext>`` (both numbers are 1-based), using the raw bytes
    and file extension that PyMuPDF reports for that image. The folder is
    then archived as ``<output_folder>.zip`` next to the folder itself.

    Args:
        input_pdf_path: Path of the PDF to read.
        output_folder: Directory that receives the image files; it is
            created if it does not exist yet.

    Returns:
        The path of the zip archive, as returned by ``shutil.make_archive``.
    """
    # The context manager closes the document even when extraction or a
    # file write fails part-way through.
    with fitz.open(input_pdf_path) as doc:
        os.makedirs(output_folder, exist_ok=True)
        for page_number, page in enumerate(doc, start=1):
            page_images = page.get_images(full=True)
            for img_number, img in enumerate(page_images, start=1):
                xref = img[0]  # first entry of each image record is its xref
                base_image = doc.extract_image(xref)
                image_ext = base_image["ext"]
                image_path = os.path.join(
                    output_folder,
                    f"page{page_number}_img{img_number}.{image_ext}",
                )
                with open(image_path, "wb") as img_file:
                    img_file.write(base_image["image"])

    # Zip the folder for easier distribution. Normalizing drops a trailing
    # path separator; without that the archive would be created as
    # "<folder>/.zip" inside the very directory being archived.
    archive_base = os.path.normpath(output_folder)
    return shutil.make_archive(archive_base, "zip", output_folder)
[docs]
def compress_pdf_images(input_pdf_path: str, output_pdf_path: str, dpi: int = 150, quality: int = 40) -> str:
    """
    Shrink a PDF by re-rendering every page as a JPEG image.

    Each page of the input is rasterized at ``dpi``, encoded as JPEG with
    the given ``quality``, and placed on a new page of the same size in the
    output document. Every output page therefore contains only that single
    image; the original text and vector content are not copied over.

    Args:
        input_pdf_path: Path of the PDF to read.
        output_pdf_path: Path the compressed PDF is saved to.
        dpi: Resolution used when rendering each page.
        quality: JPEG quality setting (0-100).

    Returns:
        ``output_pdf_path``, unchanged.
    """
    # Both documents are closed on exit, including when rendering, encoding
    # or saving raises.
    with fitz.open(input_pdf_path) as doc, fitz.open() as compressed_pdf:
        for page in doc:
            pix = page.get_pixmap(dpi=dpi)
            img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)

            # Encode in memory so no temporary image file is needed.
            jpeg_buffer = io.BytesIO()
            img.save(jpeg_buffer, format="JPEG", quality=quality)

            # The new page keeps the original page dimensions and the image
            # is stretched over its full area.
            width, height = page.rect.width, page.rect.height
            new_page = compressed_pdf.new_page(width=width, height=height)
            new_page.insert_image(
                fitz.Rect(0, 0, width, height),
                stream=jpeg_buffer.getvalue(),
            )
        compressed_pdf.save(output_pdf_path)
    return output_pdf_path