Tuesday, October 29, 2024

PyMuPDF - read page by page and extract images

 def navigate_page_by_page_pympdf():

## navigating page by page
doc = pymupdf.open("deployment_guide.pdf") # open a document
out = open("output.txt", "wb") # create a text output
for page in doc: # iterate the document pages
text = page.get_text().encode("utf8") # get plain text (is in UTF-8)
print("Text read is ",text)
# out.write(text) # write text of page
# out.write(bytes((12,))) # write page delimiter (form feed 0x0C)
out.close()

def extract_images_pympdf():
    """Save every image embedded in ``deployment_guide.pdf`` as a PNG file.

    For each page, the number of images found (or a "none found" message)
    is printed. Each image is then written to the current directory as
    ``page_<page_index>-image_<image_index>.png``, where ``page_index`` is
    0-based and ``image_index`` is 1-based within its page.

    Relies on ``pymupdf`` being imported at module level.
    """
    # The context manager closes the document even if saving an image fails.
    with pymupdf.open("deployment_guide.pdf") as doc:
        for page_index, page in enumerate(doc):
            image_list = page.get_images()

            # Report how many images were found on this page.
            if image_list:
                print(f"Found {len(image_list)} images on page {page_index}")
            else:
                print("No images found on page", page_index)

            for image_index, img in enumerate(image_list, start=1):
                xref = img[0]  # first item of the image entry is its XREF
                pix = pymupdf.Pixmap(doc, xref)

                # More than 3 colour components (alpha excluded), e.g. CMYK:
                # convert to RGB before saving as PNG.
                if pix.n - pix.alpha > 3:
                    pix = pymupdf.Pixmap(pymupdf.csRGB, pix)

                pix.save(f"page_{page_index}-image_{image_index}.png")
                pix = None  # drop the reference so the pixmap can be freed

No comments:

Post a Comment