Tuesday, October 1, 2024

An AI way to extract text from a PDF?

 async def document_analysis(filename: str) -> str:

    """

    Document Understanding

    Args:

        filename: pdf filename str

    """


    pdf = pdfium.PdfDocument(filename)

    images = []

    print("Retrieved PDF ",len(pdf))

    for i in range(len(pdf)):

        print("Iter count ", i)

        page = pdf[i]

        print("Got the page ", page)

        image = page.render(scale=8).to_pil()

        buffered = BytesIO()

        image.save(buffered, format="JPEG")

        img_byte = buffered.getvalue()

        img_base64 = base64.b64encode(img_byte).decode("utf-8")

        images.append(img_base64)


    text_of_pages = await asyncio.gather(*[parse_page_with_gpt(image) for image in images])

    print("Text of pages got")

    results = []


    extracted_texts = [doc for doc in text_of_pages]

    # Clean each string in the list and append to json_results

    for text in extracted_texts:

        results.append(text)

        

    return results




async def parse_page_with_gpt(base64_image: str) -> str:
    """Ask the chat model to turn one page image into text.

    Args:
        base64_image: base64-encoded image data; it is embedded in a
            ``data:image/jpeg`` URL and sent as the user's image content.

    Returns:
        The message content of the first returned choice, or an empty
        string when that content is falsy.
    """
    # The system prompt keeps the exact whitespace of the original literal:
    # blank lines and 12-space padding around the single sentence.
    pad = " " * 12
    system_prompt = (
        f"\n\n{pad}\n\n{pad}"
        "You are a helpful assistant that extracts information from images."
        f"\n\n{pad}\n\n{pad}"
    )

    image_part = {
        "type": "image_url",
        "image_url": {
            "url": "data:image/jpeg;base64," + base64_image,
            "detail": "auto",
        },
    }
    instruction_part = {
        "type": "text",
        "text": "Extract information from image into text",
    }

    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": [instruction_part, image_part]},
    ]

    completion = await clienta.chat.completions.create(
        model=MODEL,
        messages=conversation,
        temperature=0,
        max_tokens=4096,
    )

    extracted = completion.choices[0].message.content
    if not extracted:
        return ""
    return extracted


No comments:

Post a Comment