Source code for matviz.doctools

import fitz  # PyMuPDF
import io
import os
from PIL import Image
import shutil


def extract_images_from_pdf(input_pdf_path: str, output_folder: str) -> str:
    """Extract every embedded image from a PDF and zip the output folder.

    Each image referenced by a page is written to ``output_folder`` as
    ``page<P>_img<I>.<ext>`` (both indices are 1-based), using the file
    extension reported by PyMuPDF for that image. The folder is created if
    it does not exist. An image referenced from several pages is written
    once per referencing page.

    Args:
        input_pdf_path: Path of the PDF to read.
        output_folder: Directory that receives the extracted image files.

    Returns:
        Path of the zip archive created next to ``output_folder`` (the
        folder path with ``.zip`` appended). The archive contains
        everything present in ``output_folder``, including any files that
        were already there before the call.
    """
    os.makedirs(output_folder, exist_ok=True)

    # The context manager closes the document even if extraction or a file
    # write fails part-way through.
    with fitz.open(input_pdf_path) as doc:
        for page_number, page in enumerate(doc, start=1):
            for img_number, img in enumerate(page.get_images(full=True), start=1):
                xref = img[0]  # first entry of the image tuple is its xref
                base_image = doc.extract_image(xref)
                image_path = os.path.join(
                    output_folder,
                    f"page{page_number}_img{img_number}.{base_image['ext']}",
                )
                with open(image_path, "wb") as img_file:
                    img_file.write(base_image["image"])

    # Zip the folder for easier distribution. Trailing separators are
    # stripped from the archive base name: with "images/" the archive would
    # otherwise be written as "images/.zip", i.e. inside the directory that
    # is being archived.
    separators = os.sep + (os.altsep or "")
    archive_base = output_folder.rstrip(separators) or output_folder
    return shutil.make_archive(archive_base, "zip", output_folder)
def compress_pdf_images(input_pdf_path: str, output_pdf_path: str, dpi: int = 150, quality: int = 40) -> str:
    """Shrink a PDF by re-rendering every page as a JPEG image.

    Each page of the input is rendered to a pixmap, encoded as JPEG, and
    placed on a new page of the same size in a fresh document. Every output
    page therefore holds one raster image of the whole page: text and
    vector content are rasterised along with the embedded images.

    Args:
        input_pdf_path: Path of the PDF to read.
        output_pdf_path: Path the compressed PDF is written to.
        dpi: Resolution used when rendering each page.
        quality: JPEG quality setting (0-100) passed to the encoder.

    Returns:
        ``output_pdf_path``, unchanged.
    """
    # Context managers close both documents even when rendering, encoding
    # or saving raises.
    with fitz.open(input_pdf_path) as doc, fitz.open() as compressed_pdf:
        for page in doc:
            # alpha=False (the default) is spelled out because the sample
            # buffer is interpreted as 3-byte RGB pixels below.
            pix = page.get_pixmap(dpi=dpi, alpha=False)
            img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)

            jpeg_buffer = io.BytesIO()
            img.save(jpeg_buffer, format="JPEG", quality=quality)

            # The new page keeps the source page size; the JPEG is stretched
            # over the full page area.
            page_width = page.rect.width
            page_height = page.rect.height
            new_page = compressed_pdf.new_page(width=page_width, height=page_height)
            new_page.insert_image(
                fitz.Rect(0, 0, page_width, page_height),
                stream=jpeg_buffer.getvalue(),
            )

        compressed_pdf.save(output_pdf_path)

    return output_pdf_path