def navigate_page_by_page_pympdf(
    pdf_path: str = "deployment_guide.pdf",
    output_path: str = "output.txt",
    write_output: bool = False,
) -> None:
    """Print the plain text of a PDF, one page at a time.

    Each page's text is UTF-8 encoded and printed (as a ``bytes`` repr).
    The output file is always created/truncated, matching the historical
    behaviour; page text is only written to it when ``write_output`` is true.

    Args:
        pdf_path: Path of the PDF document to read.
        output_path: Path of the binary text file to create.
        write_output: When true, write every page's text to ``output_path``
            followed by a form feed (0x0C) as page delimiter.
    """
    # Context managers guarantee both handles are released even if text
    # extraction raises part-way through the document.
    with pymupdf.open(pdf_path) as doc, open(output_path, "wb") as out:
        for page in doc:  # iterate the document pages
            text = page.get_text().encode("utf8")  # plain text as UTF-8 bytes
            print("Text read is ", text)
            if write_output:
                out.write(text)  # write text of page
                out.write(bytes((12,)))  # page delimiter (form feed 0x0C)
def extract_images_pympdf(pdf_path: str = "deployment_guide.pdf") -> None:
    """Save every image referenced by the pages of a PDF as a PNG file.

    For each page, the number of images found is printed, then each image
    is written to the current working directory as
    ``page_<page_index>-image_<image_index>.png``. Page indexes start at 0
    and image indexes at 1.

    Args:
        pdf_path: Path of the PDF document to read.
    """
    # The context manager closes the document even if saving an image fails.
    with pymupdf.open(pdf_path) as doc:
        for page_index, page in enumerate(doc):  # iterate over pdf pages
            image_list = page.get_images()

            # Report the number of images found on the page.
            if image_list:
                print(f"Found {len(image_list)} images on page {page_index}")
            else:
                print("No images found on page", page_index)

            for image_index, img in enumerate(image_list, start=1):
                xref = img[0]  # first item of the entry is the image XREF
                pix = pymupdf.Pixmap(doc, xref)  # create a Pixmap
                if pix.n - pix.alpha > 3:  # CMYK: convert to RGB first
                    pix = pymupdf.Pixmap(pymupdf.csRGB, pix)
                pix.save(f"page_{page_index}-image_{image_index}.png")
                pix = None  # drop the reference so the pixmap can be freed
 
# No comments:
# Post a Comment