-- Living Mobile --: PyMuPDF - read page by page and extract images

Tuesday, October 29, 2024

PyMuPDF - read page by page and extract images

def navigate_page_by_page_pympdf():
    """Print the plain text of each page of ``deployment_guide.pdf``.

    The document is opened with PyMuPDF (expected to be available in the
    enclosing module as ``pymupdf``) and walked page by page. The text of
    every page is encoded as UTF-8 bytes and printed.

    ``output.txt`` is opened in binary write mode (so it is created or
    truncated), but the actual writes are currently disabled; see the
    commented-out lines inside the loop.

    Returns:
        None
    """
    # Context managers make sure both the PDF and the output file are
    # closed even if text extraction raises part-way through the document.
    with pymupdf.open("deployment_guide.pdf") as doc, \
            open("output.txt", "wb") as out:
        for page in doc:  # iterate the document pages
            # Encoded to bytes so it could be written to the binary file.
            text = page.get_text().encode("utf8")
            print("Text read is ",text)
            # Writing is disabled for now; enable to dump the text to disk:
            # out.write(text)  # write text of page
            # out.write(bytes((12,)))  # write page delimiter (form feed 0x0C)

def extract_images_pympdf():
    """Save every image referenced by ``deployment_guide.pdf`` as a PNG.

    For each page, the number of images found is printed, then each image
    is loaded as a PyMuPDF ``Pixmap`` and saved to the current directory as
    ``page_<page index>-image_<image number>.png``. Page indices are
    zero-based; image numbers start at 1 within each page.

    PyMuPDF is expected to be available in the enclosing module as
    ``pymupdf``.

    Returns:
        None
    """
    # The context manager closes the document even if saving an image fails.
    with pymupdf.open("deployment_guide.pdf") as doc:
        for page_index, page in enumerate(doc):  # iterate over pdf pages
            image_list = page.get_images()

            # Report the number of images found on the page.
            if image_list:
                print(f"Found {len(image_list)} images on page {page_index}")
            else:
                print("No images found on page", page_index)

            for image_index, img in enumerate(image_list, start=1):
                xref = img[0]  # first entry of the image record is its xref
                pix = pymupdf.Pixmap(doc, xref)

                # More than 3 colour components (alpha excluded) means
                # CMYK; convert to RGB before saving as PNG.
                if pix.n - pix.alpha > 3:
                    pix = pymupdf.Pixmap(pymupdf.csRGB, pix)

                pix.save(f"page_{page_index}-image_{image_index}.png")
                pix = None  # drop the reference so the pixmap can be freed

-- Living Mobile --

Tuesday, October 29, 2024

PyMuPDF - read page by page and extract images

No comments:

Post a Comment

Followers

Blog Archive

About Me