Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

taxable and total amount #7

Open
Elanchezhian2712 opened this issue Feb 2, 2025 · 0 comments
Open

taxable and total amount #7

Elanchezhian2712 opened this issue Feb 2, 2025 · 0 comments

Comments

@Elanchezhian2712
Copy link
Owner

import asyncio
import json
import os
import time
from io import BytesIO
from urllib.parse import urlparse

import google.generativeai as genai
import pandas as pd
import PIL.Image
import requests
from fastapi import FastAPI, File, UploadFile, HTTPException
from fastapi.responses import JSONResponse
from PIL import Image

app = FastAPI()


# token =  "AIzaSyBW9ot7RVqj2jWgnIncwVC1V8yfXW7BSsc
# token = "AIzaSyBLnWtYXmw9stMlymaxO4J_ZxhePBm-uMw"
token ="AIzaSyCooh3K9Y1xCpr-YeN0CUJ_8eHvdxqnWOg"
genai.configure(api_key=token)

# Gemini 1.5 Pro and 1.5 Flash support direct PDF processing up to 3,600 pages and images.
# They accept PDF files with MIME type 'application/pdf' and images with 'image/png', 'image/jpeg', etc.
model = genai.GenerativeModel("gemini-1.5-flash")
# model = genai.GenerativeModel("gemini-2.0-flash-thinking-exp-1219")

ALLOWED_EXTENSIONS = ['.jpg', '.jpeg', '.png', '.pdf', '.jfif', '.csv', '.xlsx']

def is_allowed_file(filename: str) -> bool:
    return any(filename.lower().endswith(ext) for ext in ALLOWED_EXTENSIONS)

def is_valid_url(link: str) -> bool:
    if link == "-" or not link:
        return False
    parsed_url = urlparse(link)
    if parsed_url.scheme and parsed_url.netloc:
        return any(link.lower().endswith(ext) for ext in ALLOWED_EXTENSIONS)
    return False

# Function to format the image for input to the Gemini model
def image_format(image_bytes):
    try:
        image_parts = { # Changed to return a dict instead of list
            "mime_type": "image/png",  # Supported mime types: PNG, JPEG, WEBP
            "data": image_bytes
        }
        return image_parts
    except Exception as e:
            raise Exception(f"Error formatting image: {e}")

@app.post("/extract-invoice")
async def extract_invoice(file: UploadFile = File(...)):
    filename = file.filename

    if not is_allowed_file(filename):
        raise HTTPException(
            status_code=400,
            detail="Invalid file type. Allowed formats are: .jpg, .jpeg, .png, .pdf, .jfif, .csv, .xlsx"
        )

    try:
        file_bytes = await file.read()

        if filename.lower().endswith(('.csv', '.xlsx')):
            if filename.lower().endswith('.csv'):
                df = pd.read_csv(BytesIO(file_bytes))
            elif filename.lower().endswith('.xlsx'):
                df = pd.read_excel(BytesIO(file_bytes))

            if "Invoice link" not in df.columns:
                raise HTTPException(status_code=400, detail="Invoice link column not found")

            extracted_data = []
            non_extracted_count = 0
            request_count = 0

            for idx, link in enumerate(df['Invoice link'], 1):
                if not is_valid_url(link):
                    non_extracted_count += 1
                    print(f"Skipping invalid link: {link}")
                    continue

                # Rate limiting logic: Wait after every 10 requests
                if request_count >= 10:
                    print(f"Processed {request_count} requests in the last minute. Waiting for the next minute...")
                    await asyncio.sleep(60)  # Wait for 1 minute
                    request_count = 0  # Reset the count after waiting

                data = await process_invoice_link(link)
                extracted_data.append(data)
                request_count += 1
                print(f"Extracted {idx} invoice(s) from the provided link.")

            return JSONResponse(content={"extracted_data": extracted_data})

        elif filename.lower().endswith('.pdf'):
            pdf_part = {
                "mime_type": "application/pdf",
                "data": file_bytes
            }
            extracted_data = await process_content(pdf_part)

        else:  # Handling image files (jpg, jpeg, png, jfif)
            pil_image = PIL.Image.open(BytesIO(file_bytes))
            pil_image = pil_image.convert("L")
            image_bytes = BytesIO()
            pil_image.save(image_bytes, format="PNG")  # Convert image to bytes
            image_part = image_format(image_bytes.getvalue())
            extracted_data = await process_content(image_part)

        return JSONResponse(content={"extracted_data": extracted_data})

    except Exception as e:
        return JSONResponse(status_code=500, content={"error": str(e)})


async def process_invoice_link(link: str):
    retries = 20
    retry_delay = 10

    for attempt in range(retries):
        try:
            response = requests.get(link)

            if response.status_code == 200:
                file_bytes = response.content
                file_ext = link.split('.')[-1].lower()

                if file_ext in ['jpg', 'jpeg', 'png', 'jfif']:
                    pil_image = PIL.Image.open(BytesIO(file_bytes))
                    pil_image = pil_image.convert("L")
                    image_bytes = BytesIO()
                    pil_image.save(image_bytes, format="PNG") # Convert image to bytes
                    image_part = image_format(image_bytes.getvalue())
                    return await process_content(image_part)

                elif file_ext == 'pdf':
                    pdf_part = {
                        "mime_type": "application/pdf",
                        "data": file_bytes
                    }
                    return await process_content(pdf_part)
                else:
                    raise HTTPException(status_code=400, detail="Unsupported file format in invoice link")

            elif response.status_code == 429:
                print(f"Quota exceeded. Retrying in {retry_delay} seconds...")
                time.sleep(retry_delay)
            else:
                raise HTTPException(status_code=400, detail="Failed to fetch the file from the provided link")

        except Exception as e:
            if attempt == retries - 1:
                raise HTTPException(status_code=400, detail=f"Error processing invoice link: {str(e)}")
            else:
                time.sleep(retry_delay)


# Renamed process_pdf to process_content and made it handle both PDF and image parts
async def process_content(content_part):
    system_prompt = """
        You are the best analyst specialist in comprehending for tax invoice bills. Analyze the Input images or pdf in the form of tax invoice will be provided to you, and your task is to respond to questions based on the content of the input image or pdf.
        """

    prompt = """
        **Task:** Extract information from a "TAX INVOICE" document.

        **Objective:** Accurately extract specific fields from a tax invoice, with a CRITICAL focus on correctly identifying Supplier vs. Buyer information, especially GSTINs.

        **Understanding GSTIN (GST Number) and PAN:**

        Before extracting GSTINs, it's important to understand their format and relationship to PAN (Permanent Account Number):

        1. **GSTIN Structure:** A GSTIN (Goods and Services Tax Identification Number) is a 15-digit alphanumeric code.
        2. **GSTIN Example:**  For example: `08ABCDE9999F1Z8`
        3. **State Code (Digits 1-2):** The first two digits represent the state code. For example, `08` indicates Rajasthan.
        4. **PAN (Digits 3-12):** The next ten digits are the Permanent Account Number (PAN) of the business entity.
        5. **Entity Number (Digit 13):** The 13th digit indicates the number of businesses registered under the same PAN within a state.
        6. **Unique Suffix (Digits 14-15):** The last two digits are system-generated unique characters.

        7. **PAN Format:** A PAN is a 10-digit alphanumeric identifier, typically in the format `ABCDE1234F`.  It will usually be present in the Supplier details section of the invoice, sometimes labeled as "PAN No." or "PAN".

        **Instructions:**

        1. **Document Type:** You are processing a "TAX INVOICE".
        2. **Focus:**  Identify and extract information *only* from the designated sections of the invoice.
        3. **Accuracy:** Prioritize accuracy and exact matches, especially for alphanumeric codes like GSTIN and Invoice Numbers.
        4. **Field Specificity:**  Follow the detailed instructions for each field below precisely. Do not extract information from irrelevant sections or infer values.
        5. **Supplier vs. Buyer Distinction (CRITICAL):** Pay very close attention to differentiating between Supplier (Seller/Invoice Issuer) and Buyer (Customer/Bill To) information. This is especially important for GSTIN and Company Name.
        6. **"Curefoods India Pvt Ltd" Handling:** If "Curefoods India Pvt Ltd", "MA CURE FOOD INDIA", or "M/s CURE FOOD INDIA" appears in the invoice, treat it as the **Buyer** and *never* as the **Supplier**.
        7. **Default Values:** If a requested field cannot be unambiguously identified, return the specified default value (e.g., "-", "0", "0%").

        **Fields to Extract (with detailed instructions):**

        * **Supplier Company Name:**
            * **Instruction:** Locate the legal business name of the invoice issuer. This is typically at the top of the invoice, often near a logo, in the header, and associated with the Supplier GSTIN and address. Identify the *seller's* or *"Bill From"* company name.
            * **Exclusion:** Do *not* extract the buyer's name or any other company name mentioned elsewhere on the invoice.
            * **Buyer Check:**  If "Curefoods India Pvt Ltd", "MA CURE FOOD INDIA", or "M/s CURE FOOD INDIA" is found, it is the buyer, *not* the supplier.

        * **Supplier GSTIN:**
            * **CRITICAL INSTRUCTION:** **Identify the GSTIN of the COMPANY ISSUING THIS INVOICE (the Seller/Supplier).**  This GSTIN *must* be found within the **SUPPLIER DETAILS SECTION** of the invoice, typically in the header area, near the Supplier Company Name and Address.
            * **GSTIN Format Reminder:** Remember, a GSTIN is a 15-digit alphanumeric code like `08ABCDE9999F1Z8`. Look for values that match this pattern.
            * **Location Keywords for Supplier Section:** Look for headings or labels like "Supplier Details," "Seller Details," "Invoice From," or simply the company logo and address at the **top of the document**.  The Supplier GSTIN will be in *this* section.
            * **Keywords for GSTIN (Supplier Section):**  Within the Supplier Section, look for labels such as "GSTIN," "GSTIN/UIN," "GST Number," or "GST No."  The GSTIN value *immediately following* these keywords in the **Supplier Section** is the Supplier GSTIN.
            * **Additional Check - PAN Proximity:** **After identifying a potential Supplier GSTIN, check if a PAN number (10-digit alphanumeric string like `ABCDE1234F`, potentially labeled "PAN No." or "PAN") is present *nearby* within the Supplier Details section.** The presence of a PAN number in close proximity strengthens the likelihood that you have correctly identified the Supplier GSTIN. Treat this as an additional confirmation step.
            * **ABSOLUTE EXCLUSION:** **DO NOT EXTRACT ANY GSTIN FROM THE "BUYER," "BILL TO," or "CUSTOMER" SECTIONS as the Supplier GSTIN.**  These are Buyer GSTINs and are *incorrect* for the Supplier GSTIN field.
            * **"Curefoods India Pvt Ltd" Rule REINFORCED:**  If "Curefoods India Pvt Ltd", "MA CURE FOOD INDIA", or "M/s CURE FOOD INDIA" is present, it is ALWAYS the **Buyer**.  Ignore any GSTIN associated with these names when looking for the **Supplier GSTIN**.
            * **Format:** Ensure the extracted value is in the alphanumeric GSTIN format.
            * **Missing Value:** If you **cannot unambiguously** identify a GSTIN within the **SUPPLIER DETAILS SECTION**, return "-".  Do not guess or extract from other sections.

        * **Address:**
            * **Instruction:** Extract the complete supplier address block, including all lines.
            * **Exclusion:** Do not include buyer address details.

        * **Invoice No:**
            * **Instruction:** Locate the alphanumeric invoice identifier, typically near the "Invoice Date" and labeled "Invoice No.", "Invoice Number", etc.
            * **Format:** Capture the exact alphanumeric sequence.

        * **Invoice Date:**
            * **Instruction:** Extract the invoice issue date.
            * **Format:** Convert to [DAY/MONTH/YEAR] format (e.g., [01/12/2024]).

        * **Supplier State Code:**
            * **Instruction:** Extract the numeric state code *only* from the explicit "State Code" field.
            * **Source:** Do *not* infer or fetch from the address.
            * **Missing Value:** If "State Code" field is missing or empty, return "-".

        * **Buyer Company Name:**
            * **Instruction:** Locate the company name the invoice is billed to, typically under headings like "Bill to:", "Billed to:", "Consignee to:", or "Shipped to:".
            * **Missing Value:** If not found, return "-".

        * **Buyer State:**
            * **Instruction:** Extract the full name of the buyer's state, usually in the "BILL TO" section.
            * **Missing Value:** If not found, return "-".

        * **Buyer GSTIN:**
            * **CRITICAL INSTRUCTION:** **Identify the GSTIN of the COMPANY BEING BILLED (the Buyer/Customer).** This GSTIN *must* be found within the **BUYER DETAILS SECTION** of the invoice.
            * **GSTIN Format Reminder:** Remember, a GSTIN is a 15-digit alphanumeric code like `08ABCDE9999F1Z8`. Look for values that match this pattern.
            * **Location Keywords for Buyer Section:** Look for headings or labels like "Buyer Details," "Bill To," "Billed To," "Customer Details," "Consignee Details," or "Shipped To." The Buyer GSTIN will be in *this* section.
            * **Keywords for GSTIN (Buyer Section):** Within the Buyer Section, look for labels such as "GSTIN," "GSTIN/UIN," "GST Number," or "GST No." The GSTIN value *immediately following* these keywords in the **Buyer Section** is the Buyer GSTIN.
            * **ABSOLUTE EXCLUSION:** **DO NOT EXTRACT ANY GSTIN FROM THE "SUPPLIER," "SELLER," or "BILL FROM" SECTIONS as the Buyer GSTIN.** These are Supplier GSTINs and are *incorrect* for the Buyer GSTIN field.
            * **Scope:** Extract *only* the alphanumeric GSTIN value, excluding labels like "GSTIN:". Ensure it's in GSTIN format.
            * **Missing Value:** If a valid Buyer GSTIN **cannot be unambiguously** identified within the **BUYER DETAILS SECTION**, return "-". Do not guess or extract from other sections.

        * **Taxable Value:**
            * **Instruction:** Extract the total taxable value before taxes. If the "Taxable Value" is explicitly mentioned, use that value. If it is not explicitly mentioned, return only "0" and do not include the total invoice amount in the taxable value.
            * **Missing Value:** If neither "Taxable Value" are not found, return "0"..

        * **Tax Rate:**
            * **Instruction:** Extract the applicable tax rate. Prioritize CGST or SGST rates. If both CGST and SGST are present and identical, return that rate. If not found or different, check for IGST or "Integrated Tax".
            * **Format:** Output as a percentage value (e.g., "5%").
            * **Inconsistent/Missing Rates:** If rates are missing or inconsistent across CGST, SGST, IGST, and "Integrated Tax", return "0%".

        * **CGST:**
            * **Instruction:** Extract the CGST amount.
            * **Missing Value:** If not found, return "0".

        * **SGST:**
            * **Instruction:** Extract the SGST amount.
            * **Missing Value:** If not found, return "0".

        * **IGST:**
            * **Instruction:** Extract the IGST amount.
            * **Missing Value:** If not found, return "0".

        * **Discount:**
            * **Instruction:** Extract any explicitly mentioned discount.
            * **Missing Value:** If none found, return "0".

        * **Total Amount:**
            * **Instruction:** Total Amount of invoice bill, including taxes.
            * **Missing Value:** If missing, return "0".


        **Output Format:**

        Return the extracted information in the following JSON format:

        ```json
        {
            "Supplier Company Name": "<supplier_company_name>",
            "Supplier GSTIN": "<supplier_gstin>",
            "Address": "<address>",
            "Invoice No": "<invoice_no>",
            "Invoice Date": "<invoice_date>",
            "Supplier State Code": "<supplier_state_code>",
            "Buyer Company Name": "<buyer_company_name>",
            "Buyer State": "<buyer_state>",
            "Buyer GST": "<buyer_gst>",
            "Taxable Value": "<taxable_value>",
            "Tax Rate": "<tax_rate>",
            "CGST": "<cgst>",
            "SGST": "<sgst>",
            "IGST": "<igst>",
            "Discount": "<discount>",
            "Total Amount": "<total_amount>"
        }
        ```
        """

    try:
        response = await model.generate_content_async([system_prompt,prompt, content_part])
        raw_output = response.text
    except Exception as e:
        return {}

    start_index = raw_output.find("{")
    end_index = raw_output.rfind("}")

    if start_index == -1 or end_index == -1:
       return {}

    json_content = raw_output[start_index:end_index + 1]

    try:
        parsed_data = json.loads(json_content)
    except json.JSONDecodeError as e:
        return {}

    expected_fields = [
        "Supplier Company Name", "Supplier GSTIN", "Address", "Invoice No", "Invoice Date",
        "Supplier State Code","Buyer Company Name", "Buyer State", "Buyer GST", "Taxable Value",
        "Tax Rate", "CGST", "SGST", "IGST", "Discount", "Total Amount"
    ]
    for field in expected_fields:
        if field not in parsed_data or parsed_data[field] is None:
            parsed_data[field] = "-"

    # Address Cleaning
    if "Address" in parsed_data and parsed_data["Address"] != "-":
        address_parts = parsed_data["Address"].split()
        parsed_data["Address"] = " ".join(address_parts).replace(',',',').replace("\n", ", ").replace(",,", ", ")

    # Rate Cleaning
    if "Tax Rate" in parsed_data:
        if not parsed_data["Tax Rate"].endswith("%"):
            parsed_data["Tax Rate"] = "0%"

    # Discount Cleaning
    if "Discount" in parsed_data:
        if parsed_data["Discount"] == "NILL" or parsed_data["Discount"] == "":
            parsed_data["Discount"] = "0"

    # Buyer State Validation
    if "Buyer State" in parsed_data and parsed_data["Buyer State"] != "-":
        buyer_state = parsed_data["Buyer State"].strip()

        # Check if the Buyer State contains any digits
        if any(char.isdigit() for char in buyer_state):
            parsed_data["Buyer State"] = "-"
        else:
            valid_states = [
                "Andhra Pradesh", "Arunachal Pradesh", "Assam", "Bihar", "Chhattisgarh", "Goa",
                "Gujarat", "Haryana", "Himachal Pradesh", "Jharkhand", "Karnataka", "Kerala",
                "Madhya Pradesh", "Maharashtra", "Manipur", "Meghalaya", "Mizoram", "Nagaland",
                "Odisha", "Punjab", "Rajasthan", "Sikkim", "Tamil Nadu", "Telangana", "Tripura",
                "Uttar Pradesh", "Uttarakhand", "West Bengal"
            ]

            # Ensure only valid state names are allowed
            if buyer_state not in valid_states:
                parsed_data["Buyer State"] = "-"

    # Supplier Company Name Cleaning
    if "Supplier Company Name" in parsed_data and parsed_data["Supplier Company Name"]:
        if parsed_data.get("Supplier Company Name", "").strip().lower().startswith("curefoods"):
            parsed_data["Supplier Company Name"] = "-"
        parsed_data["Supplier Company Name"] = parsed_data["Supplier Company Name"].replace("\n", " ").strip()

    if "Buyer GST" in parsed_data and parsed_data["Buyer GST"] != "-":
        buyer_gst = parsed_data["Buyer GST"].strip()
        buyer_gst = buyer_gst.replace("GSTIN :", "").replace("GSTIN:", "").replace("GST No.", "").strip() # Remove common prefixes
        if not buyer_gst.isalnum() or len(buyer_gst) != 15: # Basic GSTIN validation (alphanumeric and length 15 - you might need more robust validation)
            parsed_data["Buyer GST"] = "-" # If it's not a valid GSTIN format, return "-"
        else:
            parsed_data["Buyer GST"] = buyer_gst

    return parsed_data
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant