Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

new invoice code #8

Open
Elanchezhian2712 opened this issue Feb 7, 2025 · 0 comments
Open

new invoice code #8

Elanchezhian2712 opened this issue Feb 7, 2025 · 0 comments

Comments

@Elanchezhian2712
Copy link
Owner

# Module-level Gemini model instance (presumably consumed by
# process_content -- TODO confirm, its body is not fully visible here).
model = genai.GenerativeModel("gemini-1.5-pro")
# Alternative model, currently disabled:
# model = genai.GenerativeModel("gemini-2.0-flash-thinking-exp-01-21")

# Lower-case suffixes accepted by is_allowed_file() and is_valid_url();
# both lower-case their input before comparing against this list.
ALLOWED_EXTENSIONS = ['.jpg', '.jpeg', '.png', '.pdf', '.jfif', '.csv', '.xlsx']

def is_allowed_file(filename: str) -> bool:
    """Return True when *filename* ends with one of ALLOWED_EXTENSIONS.

    The comparison is case-insensitive. A missing, empty, or non-string
    filename (an upload may arrive without one) is rejected instead of
    raising ``AttributeError`` on ``.lower()``.
    """
    if not filename or not isinstance(filename, str):
        return False
    # str.endswith accepts a tuple, so one call covers every extension.
    return filename.lower().endswith(tuple(ALLOWED_EXTENSIONS))

def is_valid_url(link: str) -> bool:
    """Return True when *link* is an absolute URL ending in an allowed extension.

    A link is accepted only if it is a non-empty string other than the
    placeholder "-", parses to a URL with both a scheme and a network
    location, and (case-insensitively) ends with one of ALLOWED_EXTENSIONS.

    Anything else -- including non-string values and strings that
    ``urlparse`` refuses to parse -- yields False rather than an exception.
    """
    # Values read from a spreadsheet column may be blank (pandas yields a
    # float NaN, which is truthy) or numeric; urlparse() raises on those.
    if not isinstance(link, str) or not link or link == "-":
        return False
    try:
        parsed_url = urlparse(link)
    except ValueError:
        # e.g. an unbalanced "[" in the network location.
        return False
    if not (parsed_url.scheme and parsed_url.netloc):
        return False
    return link.lower().endswith(tuple(ALLOWED_EXTENSIONS))

# Function to format the image for input to the Gemini model
def image_format(image_bytes, *, mime_type="image/png"):
    """Wrap encoded image bytes in the inline-data dict given to the model.

    Args:
        image_bytes: The encoded image content.
        mime_type: MIME type describing ``image_bytes``. Defaults to
            "image/png", matching the callers in this file, which re-encode
            every image as PNG before calling. The original note on this
            function lists PNG, JPEG and WEBP as supported types.

    Returns:
        A dict with the keys "mime_type" and "data".
    """
    # Building a dict literal cannot fail, so no exception wrapping is needed.
    return {
        "mime_type": mime_type,
        "data": image_bytes,
    }


@app.post("/extract-invoice")
async def extract_invoice(file: UploadFile = File(...)):
    """Extract invoice data from an uploaded file.

    Three kinds of upload are handled, chosen by file extension:

    * ``.csv`` / ``.xlsx``: the sheet must have an "Invoice link" column.
      Each valid link is passed to ``process_invoice_link`` (at most 10 per
      60-second window); invalid or blank cells are reported as "skipped".
    * ``.pdf``: the raw bytes are passed to ``process_content``.
    * images: converted to greyscale PNG and passed to ``process_content``.

    Returns:
        JSONResponse with an "extracted_data" payload on success, or a
        500 response with an "error" message for unexpected failures.

    Raises:
        HTTPException: 400 for an unsupported/missing filename or a sheet
            without the "Invoice link" column; HTTPExceptions raised by the
            helpers propagate unchanged.
    """
    filename = file.filename

    # The filename may be absent on some uploads; treat that as invalid too.
    if not filename or not is_allowed_file(filename):
        raise HTTPException(
            status_code=400,
            detail="Invalid file type. Allowed formats are: .jpg, .jpeg, .png, .pdf, .jfif, .csv, .xlsx"
        )

    lowered_name = filename.lower()
    extracted_data: List[Dict[str, Any]] = []
    request_count = 0  # Requests issued in the current rate-limit window
    last_request_time = time.time()  # Start of the current rate-limit window

    try:
        file_bytes = await file.read()

        if lowered_name.endswith(('.csv', '.xlsx')):
            if lowered_name.endswith('.csv'):
                df = pd.read_csv(BytesIO(file_bytes))
            else:
                df = pd.read_excel(BytesIO(file_bytes))

            if "Invoice link" not in df.columns:
                raise HTTPException(status_code=400, detail="Invoice link column not found")

            for idx, link in enumerate(df['Invoice link'], 1):
                # Validate first so rows that are skipped never trigger a
                # rate-limit wait. Blank cells arrive as float NaN, which is
                # neither a usable URL nor JSON-serialisable.
                if not isinstance(link, str) or not is_valid_url(link):
                    print(f"Skipping invalid link: {link}")
                    reported_link = None if pd.isna(link) else str(link)
                    extracted_data.append({"status": "skipped", "link": reported_link, "error": "Invalid URL"})
                    continue

                # Check rate limit: only 10 requests per minute
                current_time = time.time()
                if current_time - last_request_time > 60:
                    # The previous window has elapsed; start a new one.
                    request_count = 0
                    last_request_time = current_time

                if request_count >= 10:
                    print(f"Processed {request_count} requests in the last minute. Waiting for the next minute...")
                    # Sleep for whatever remains of the current window.
                    await asyncio.sleep(60 - (current_time - last_request_time))
                    last_request_time = time.time()
                    request_count = 0

                # Extract data from the link
                data = await process_invoice_link(link)
                extracted_data.append(data)
                request_count += 1
                print(f"Extracted {idx} invoice(s) from the provided link.")

            return JSONResponse(content={"extracted_data": extracted_data})

        elif lowered_name.endswith('.pdf'):
            pdf_part = {
                "mime_type": "application/pdf",
                "data": file_bytes
            }
            # NOTE(review): unlike the image branch, the result replaces the
            # list instead of being appended -- confirm this is intended.
            extracted_data = await process_content(pdf_part)

        else:  # Handling image files (jpg, jpeg, png, jfif)
            pil_image = PIL.Image.open(BytesIO(file_bytes))
            pil_image = pil_image.convert("L")  # greyscale
            image_bytes = BytesIO()
            pil_image.save(image_bytes, format="PNG")
            image_part = image_format(image_bytes.getvalue())
            extracted_data.append(await process_content(image_part))

        return JSONResponse(content={"extracted_data": extracted_data})

    except HTTPException:
        # Deliberate HTTP errors (e.g. the missing-column 400) must keep
        # their status code instead of being rewrapped as a generic 500.
        raise
    except Exception as e:
        return JSONResponse(status_code=500, content={"error": str(e)})



async def process_invoice_link(link: str):
    """Download the invoice at *link* and run it through ``process_content``.

    The file type is taken from the text after the last "." in the link.
    Images are converted to greyscale PNG before being submitted; PDFs are
    submitted as-is. Failed attempts (HTTP 429, other non-200 responses, or
    any exception while fetching/processing) are retried up to 20 times
    with a 10-second pause between attempts.

    Args:
        link: URL of the invoice file.

    Returns:
        Whatever ``process_content`` returns for the downloaded file.

    Raises:
        HTTPException: 400 when the extension is unsupported (raised
            immediately, without any download) or when every attempt failed.
    """
    retries = 20
    retry_delay = 10
    # requests applies no timeout by default and could otherwise wait forever.
    request_timeout = 30

    # The extension depends only on the link, so reject unsupported formats
    # up front instead of retrying an error that can never succeed.
    file_ext = link.split('.')[-1].lower()
    is_image = file_ext in ['jpg', 'jpeg', 'png', 'jfif']
    if not is_image and file_ext != 'pdf':
        raise HTTPException(status_code=400, detail="Unsupported file format in invoice link")

    loop = asyncio.get_running_loop()
    last_error = None

    for attempt in range(retries):
        try:
            # requests is blocking; run it in the default executor so the
            # event loop keeps serving other requests meanwhile.
            response = await loop.run_in_executor(
                None, lambda: requests.get(link, timeout=request_timeout)
            )

            if response.status_code == 200:
                file_bytes = response.content

                if is_image:
                    pil_image = PIL.Image.open(BytesIO(file_bytes))
                    pil_image = pil_image.convert("L")  # greyscale
                    image_bytes = BytesIO()
                    pil_image.save(image_bytes, format="PNG")
                    image_part = image_format(image_bytes.getvalue())
                    return await process_content(image_part)

                pdf_part = {
                    "mime_type": "application/pdf",
                    "data": file_bytes
                }
                return await process_content(pdf_part)

            if response.status_code == 429:
                print(f"Quota exceeded. Retrying in {retry_delay} seconds...")
                last_error = "Quota exceeded (HTTP 429)"
            else:
                raise HTTPException(status_code=400, detail="Failed to fetch the file from the provided link")

        except Exception as e:
            last_error = str(e)

        # Pause between attempts without blocking the event loop; there is
        # nothing to wait for after the final attempt.
        if attempt < retries - 1:
            await asyncio.sleep(retry_delay)

    # Every attempt failed. Raising here also covers a final 429, which
    # would otherwise fall out of the loop and return None.
    raise HTTPException(status_code=400, detail=f"Error processing invoice link: {last_error}")


# Renamed process_pdf to process_content and made it handle both PDF and image parts
async def process_content(content_part):
    system_prompt = """
        You are the best analyst specialist in comprehending for tax invoice bills. Analyze the Input images or pdf in the form of tax invoice will be provided to you, and your task is to respond to questions based on the content of the input image or pdf. Your top priority is to provide CONSISTENT results across multiple uploads of the SAME invoice. If uncertain, prioritize returning default values ("-", "0", "0%"). **When processing invoices, especially those with handwritten parts, prioritize visual clarity and verification over potentially unclear OCR output. Double-check handwritten alphanumeric codes like GSTINs and Invoice Numbers to ensure accuracy.**
        """

    prompt = """
        **Task:** Extract information from a "TAX INVOICE" document.

        **Objective:** Accurately extract specific fields from a tax invoice, with a CRITICAL focus on correctly identifying Supplier vs. Buyer information, especially GSTINs and handling handwritten text. **MAXIMUM CONSISTENCY IS REQUIRED.** The same invoice, when uploaded multiple times, MUST produce the EXACT SAME output. **For handwritten parts of the invoice, prioritize visual verification to ensure accurate data extraction, especially for critical alphanumeric codes.**

        **Understanding GSTIN (GST Number) and PAN:**

        Before extracting GSTINs, it's important to understand their format and relationship to PAN (Permanent Account Number):

        - 1. **GSTIN Structure:** A GSTIN (Goods and Services Tax Identification Number) is a 15-digit alphanumeric code.
        - 2. **GSTIN Example:**  For example: `08ABCDE9999F1Z8`
        - 3. **State Code (Digits 1-2):** The first two digits represent the state code. For example, `08` indicates Rajasthan.
        - 4. **PAN (Digits 3-12):** The next ten digits are the Permanent Account Number (PAN) of the business entity.
        - 5. **Entity Number (Digit 13):** The 13th digit indicates the number of businesses registered under the same PAN within a state.
        - 6. **Unique Suffix (Digits 14-15):** The last two digits are system-generated unique characters.

        + 1. **GSTIN Structure:** A GSTIN (Goods and Services Tax Identification Number) is a 15-digit alphanumeric code.
        + 2. **GSTIN Example:** For example: `27AAHCT8247N1Z1`
        + 3. **State Code (Digits 1-2):** The first two digits represent the Indian State Code (e.g., `27` for Maharashtra, `08` for Rajasthan, `33` for Tamil Nadu). Refer to a list of valid Indian State Codes if needed.
        + 4. **PAN (Digits 3-12):** The next ten digits are the PAN (Permanent Account Number) of the business entity. These ten characters *must be* alphanumeric.
        + 5. **Entity Code (Digit 13):** The 13th digit is an alphanumeric Entity Code (1-9, then A-Z), indicating the number of registrations the business has within the state under the same PAN.
        + 6.  **14th Character:** The 14th character is *always* the letter 'Z'.  **(This is a fixed part of the GSTIN format.)**
        + 7. **Checksum Digit (Digit 15):** The 15th digit is an alphanumeric checksum character used for validation. (No calculation required for extraction, but it *must* be alphanumeric).

        - 7. **PAN Format:** A PAN is a 10-digit alphanumeric identifier, typically in the format `ABCDE1234F`.  It will usually be present in the Supplier details section of the invoice, sometimes labeled as "PAN No." or "PAN".

        + 8. **PAN Format:** A PAN is a 10-digit alphanumeric identifier, typically in the format `ABCDE1234F`. It might be present in the Supplier details section of the invoice, sometimes labeled as "PAN No." or "PAN".

        **Instructions:**

        1.  **Document Type:** You are processing a "TAX INVOICE".
        2.  **Focus:**  Identify and extract information *only* from the designated sections of the invoice.
        3.  **Accuracy and Clarity for Handwritten Text:** Prioritize accuracy and exact matches, especially for alphanumeric codes like GSTIN and Invoice Numbers. **For handwritten parts, visually verify the characters to resolve ambiguity. Pay extra attention to digits that can be easily confused (e.g., 1, 7).** CONSISTENCY is MORE important than extracting a potentially incorrect value.
        4.  **Field Specificity:**  Follow the detailed instructions for each field below precisely. Do not extract information from irrelevant sections or infer values.
        5.  **Supplier vs. Buyer Distinction (CRITICAL):** Pay very close attention to differentiating between Supplier (Seller/Invoice Issuer) and Buyer (Customer/Bill To) information. This is especially important for GSTIN and Company Name.
        6.  **"Curefoods India Pvt Ltd" Handling:** If "Curefoods India Pvt Ltd", "MA CURE FOOD INDIA", or "M/s CURE FOOD INDIA" appears in the invoice, treat it as the **Buyer** and *never* as the **Supplier**.
        7.  **Default Values:** If a requested field cannot be *unambiguously* identified, ALWAYS return the specified default value (e.g., "-", "0", "0%").

        **Fields to Extract (with detailed instructions):**

        * **Supplier Company Name:**
            *   **Instruction:** Locate the legal business name of the invoice issuer. This is typically at the top of the invoice, often near a logo, and associated with the Supplier GSTIN and address. Identify the *Seller's* or *"Bill From"* or *"Sold By"* company name. **For handwritten company names, carefully verify the spelling and characters.**
            *   **Exclusion:** Do *not* extract the buyer's name or any other company name mentioned elsewhere on the invoice.
            *   **Buyer Check:**  If "Curefoods India Pvt Ltd", "MA CURE FOOD INDIA", or "M/s CURE FOOD INDIA" is found, it is the buyer, *not* the supplier.
            *   **Location Heuristic:** If no clear "Supplier Details", "Seller Details", or "Bill From" section is found, check the TOP-LEFT area of the invoice image/PDF. The Supplier Company Name is LIKELY to be located there.
            *   **Missing Value:** If still not found after checking the top-left, return "-".

        * **Supplier GSTIN:**
            *   **CRITICAL INSTRUCTION:** **Identify the GSTIN of the COMPANY ISSUING THIS INVOICE (the Seller/Supplier).  DO NOT, UNDER ANY CIRCUMSTANCES, EXTRACT THE BUYER'S GSTIN FOR THIS FIELD.**
            *   **FIRST: IDENTIFY THE SUPPLIER SECTION:** Before attempting to extract the GSTIN, **FIRST LOCATE THE SECTION OF THE INVOICE THAT CLEARLY CONTAINS SUPPLIER DETAILS (Company Name, Address, Logo).**  This is typically at the top of the invoice.
            *   **GSTIN Location (RESTATED):** The Supplier GSTIN *must* be found within the **SUPPLIER DETAILS SECTION** of the invoice, typically in the header area, near the Supplier Company Name and Address.
            *   **GSTIN Format Reminder:** Remember, a GSTIN is a 15-digit alphanumeric code like `08ABCDE9999F1Z8`. Look for values that match this pattern.
            *   **Location Keywords for Supplier Section:** Look for headings or labels like "Supplier Details," "Seller Details," "Invoice From," or simply the company logo and address at the **top of the document**. The Supplier GSTIN will be in *this* section.  If you cannot confidently identify this section, **STOP AND RETURN "-"**.
            *   **Keywords for GSTIN (Supplier Section):** Within the Supplier Section, look for labels such as "GSTIN," "GSTIN/UIN," "GST Number," or "GST No." The GSTIN value *immediately following* these keywords in the **Supplier Section** is the Supplier GSTIN.
            *   **Address Adjacency Rule:**  The Supplier GSTIN should be in the same visual block as the Supplier's Address. IF there are multiple addresses on the invoice, the correct Supplier GSTIN is the one CLOSEST to the supplier's registered address. If you find a GSTIN far away from the Supplier Address, it is likely incorrect.
            *   **Additional Check - PAN Proximity:** **After identifying a potential Supplier GSTIN, check if a PAN number (10-digit alphanumeric string like `ABCDE1234F`, potentially labeled "PAN No." or "PAN") is present *nearby* within the Supplier Details section.** The presence of a PAN number in close proximity strengthens the likelihood that you have correctly identified the Supplier GSTIN. Treat this as an additional confirmation step.
            *   **ABSOLUTE EXCLUSION (REPEATED):** **DO NOT EXTRACT ANY GSTIN FROM THE "BUYER," "BILL TO," or "CUSTOMER" SECTIONS as the Supplier GSTIN.** These are Buyer GSTINs and are *incorrect* for the Supplier GSTIN field. **IF YOU ARE UNSURE, RETURN "-"**.
            *   **"Curefoods India Pvt Ltd" Rule REINFORCED (AGAIN):** If "Curefoods India Pvt Ltd", "MA CURE FOOD INDIA", or "M/s CURE FOOD INDIA" is present, it is ALWAYS the **Buyer**. Ignore any GSTIN associated with these names when looking for the **Supplier GSTIN**.
            *   **DOUBLE CHECK:** **BEFORE RETURNING A GSTIN AS THE SUPPLIER GSTIN, VERIFY THAT IT IS IN THE SECTION OF THE DOCUMENT CONTAINING THE SUPPLIER'S NAME AND ADDRESS, AND NOT THE BUYER'S.**
            *   **Address as Disambiguation:** If there is more than one company name at the top of the invoice, use the address to disambiguate. Extract the GSTIN associated with the company whose address is also at the top of the invoice.
            *   **Format:** Ensure the extracted value is in the alphanumeric GSTIN format.
                *   **GSTIN Validation:** **After extracting a *potential* Supplier GSTIN, perform the following VALIDATION CHECKS to confirm its validity:**
                    *   **Length Check:** The GSTIN *must* be exactly 15 characters long. If it is not, it is *invalid*.
                    *   **State Code Check:** The first two digits *must* be numeric (0-9).  **(Example State Codes: 01-Jammu & Kashmir, 03-Punjab, 07-Delhi, 27-Maharashtra, 33-Tamil Nadu, etc.).** While a full state code validation is not required, ensure the first two characters are *numerals* representing a plausible state code.
                    *   **PAN Check:** Characters 3 to 12 (next 10 characters) *must* be alphanumeric (digits 0-9 and uppercase letters A-Z).
                    *   **Entity Code Check:** The 13th character *must* be alphanumeric (digits 1-9 or uppercase letters A-Z).
                    *   **14th Character Check:** The 14th character *must* be the uppercase letter 'Z'.  **If the 14th character is *anything* other than 'Z' (including any number), the GSTIN is immediately invalid. Return "-".**
                    *   **Checksum Digit:** The 15th character is a checksum (alphanumeric). While checksum calculation is not required, ensure it is alphanumeric.
                    *   **Validation Failure:** **If the potential Supplier GSTIN FAILS *any* of these validation checks, it is *incorrect*. In case of validation failure, IMMEDIATELY DISREGARD the extracted GSTIN and return "-" for Supplier GSTIN. DO NOT attempt to "correct" an invalid GSTIN. A GSTIN that fails validation is *wrong* and should be treated as missing data ("-").**
            *   **Conflict Resolution Rule (GSTIN):** IF the potential Supplier GSTIN is associated with a company name that is clearly identified as the BUYER (e.g., "Curefoods..."), DISREGARD that GSTIN and search for another one within the Supplier Details section. If NO other GSTIN is found, return "-".
            *   **Unambiguous Identification MANDATORY:** IF, after applying ALL the above steps, the Supplier GSTIN CANNOT be *unambiguously* and confidently identified within the **SUPPLIER DETAILS SECTION**, RETURN "-". Do NOT attempt to guess or extract from other sections.
            *   **CONSISTENCY RULE:** Strive for maximum consistency. The same Supplier GSTIN MUST be extracted every time the same invoice is uploaded. If you are ever uncertain, RETURN "-".
            *       **NEGATIVE CONSTRAINT (SUPPLIER/BUYER SWAP - GSTIN):** The Supplier GSTIN must NOT be '06AAJCC0315JIZY'. This is the Buyer's GSTIN. If this is the ONLY GSTIN found in the Supplier section, return "-".
            *   **Missing Value:** If you **cannot unambiguously** identify a GSTIN within the **SUPPLIER DETAILS SECTION**, return "-". Do not guess or extract from other sections. **PRIORITIZE RETURNING "-" OVER A POTENTIALLY INCORRECT VALUE.**

            
        * **Address:**
            * **Instruction:** Extract the complete supplier address block, including all lines. **For handwritten addresses, carefully verify the spelling and characters.**
            * **Address Boundary:**  The supplier address block consists ONLY of the lines containing the street address, locality, city, and postal code.  It MUST NOT include phone numbers, email addresses, website URLs, or any other information that is NOT part of the physical mailing address.
            * **Address Line Separator:** Return each line of the address on a new line, preserving the original line breaks from the invoice.
            * **Exclusion:** Do not include buyer address details.
            * **Address Consistency:** Ensure the extracted address is CONSISTENT with the Supplier Company Name. If the address appears to belong to a different company, return "-" for the Address.
            * **Address Table Exclusion:** The supplier address must end BEFORE the start of any table or grid of data on the invoice. Look for clear visual delimiters like column headings or line separations that indicate the start of a table. If the address runs into a table, truncate it BEFORE the table begins.
            * **Unambiguous Identification MANDATORY:** IF, after applying ALL the above steps, the Address CANNOT be *unambiguously* and confidently identified, RETURN "-".
            *  **CONSISTENCY RULE:** Strive for maximum consistency. The same Address MUST be extracted every time the same invoice is uploaded. If you are ever uncertain, RETURN "-".

        * **Invoice No:**
            * **Instruction:** Locate the alphanumeric invoice identifier, typically near the "Invoice Date" and labeled "Invoice No.", "Invoice Number", **or "Bill No."**. **For handwritten invoice numbers, and *even for printed ones*, carefully and visually verify *each and every* character, especially alphanumeric combinations.  It is *absolutely, critically, and non-negotiably crucial* to capture the *ENTIRE, COMPLETE, and UNTRUNCATED* Invoice Number, including *each and every single* digit and letter in the sequence, from the *very first character to the very last character*, without missing anything.  Do *not* truncate, shorten, omit, or miss *any single part, digit, letter, or character* of the Invoice Number in *any way whatsoever*.  The extracted Invoice Number *must be* a *100% exact, complete, and visually identical* match to what is presented on the invoice.  **YOU *MUST* EXTRACT THE *ENTIRE* BILL NO./INVOICE NO. - DO *NOT* STOP SHORT!**
            * **Negative Constraint - *Do Not Truncate*:** **UNDER *NO CIRCUMSTANCES* ARE YOU ALLOWED TO TRUNCATE, SHORTEN, OR OMIT *ANY* CHARACTER from the Invoice Number/Bill No.  You *MUST* extract the *full, complete, and visually presented* Invoice/Bill Number.  If you find a sequence that *looks like* an Invoice Number or Bill No., you *MUST* visually check to the right and *ensure you have captured *all* the characters* that are part of that identifier. **DO *NOT* ASSUME YOU HAVE REACHED THE END OF THE INVOICE NUMBER PREMATURELY!**  **ALWAYS CHECK FOR MORE CHARACTERS TO THE RIGHT OF A POTENTIAL INVOICE/BILL NUMBER!**
            * **Format:** Capture the **exact and complete** alphanumeric sequence. **Ensure you extract *all* characters of the Invoice Number, from the *absolute very beginning* to the *absolute very end*.  Double-check *multiple, multiple, multiple times* - visually, character by character - that no characters are missed, truncated, or incorrectly identified.  The *entire* Invoice Number, as it *fully and completely* appears on the invoice image, is *absolutely required*.  **VISUAL COMPLETENESS AND EXACTNESS ARE PARAMOUNT FOR INVOICE NUMBERS.**
        
        * **Invoice Date:**
            * **Instruction:** Extract the invoice issue date. **For handwritten dates, ensure the date components (day, month, year) are correctly identified.**
            * **Format:** Convert to [DAY/MONTH/YEAR] format (e.g., 01/12/2024).

        * **Supplier State Code:**
            * **CRITICAL INSTRUCTION:**  **You MUST ONLY extract the Supplier State Code from a field explicitly labeled "State Code". This field will contain ONLY the two-digit numeric state code.**
            * **SOURCE RESTRICTION: ABSOLUTELY DO NOT attempt to infer the state code from the Supplier's Address, GSTIN, or any other field.  The ONLY valid source is a field labeled "State Code".**
            * **EXAMPLE OF CORRECT FIELD:** Look for a field that looks exactly like:  `State Code: 27`
            * **EXAMPLE OF INCORRECT INFERENCE:** Do NOT try to use the first two digits of the GSTIN to determine the state code. This is incorrect and WILL result in errors.
            * **NEGATIVE CONSTRAINT:  If you cannot find a field labeled "State Code", you MUST return "-" for this field.  There are NO EXCEPTIONS to this rule.**
            * **Missing Value:** If "State Code" field is missing or empty, return "-".

        * **Buyer Company Name:**
            * **Instruction:** Locate the company name the invoice is billed to, typically under headings like "Bill to:", "Billed to:", "Consignee to:", or "Shipped to:". **For handwritten buyer names, carefully verify the spelling and characters.**
            * **Curefoods Priority Check:** IF any variation of "Curefoods India Pvt Ltd" (including "MA CURE FOOD INDIA" or "M/s CURE FOOD INDIA" or "CAKEZONE FOODTECH PVT LTD") is found ABOVE a potential "Bill to," "Billed to," etc. section, THEN that "Curefoods" name MUST be extracted as the Buyer Company Name, regardless of other company names present.
            * **Fallback:** If the "Bill to," "Billed to," etc. headings AND a "Curefoods" name above are NOT found, then the buyer company name will be the company name that is NOT the Supplier company.
            * **Unambiguous Identification MANDATORY:** IF, after applying ALL the above steps, the Buyer Company Name CANNOT be *unambiguously* and confidently identified, RETURN "-".
            * **CONSISTENCY RULE:** Strive for maximum consistency. The same Buyer Company Name MUST be extracted every time the same invoice is uploaded. If you are ever uncertain, RETURN "-".
            *    **NEGATIVE CONSTRAINT (BUYER IS NOT SUPPLIER):** UNDER NO CIRCUMSTANCES should the Buyer Company Name be the same as the extracted Supplier Company Name. If they appear to be the same, return "-" for the Buyer Company Name.
            * **Missing Value:** If not found, return "-".

        * **Buyer State:**
            * **Instruction:** Extract the full name of the buyer's state, usually in the "BILL TO" section or in the buyer's address. **For handwritten states, verify the spelling.**
            * **Unambiguous Identification MANDATORY:** IF, after applying ALL the above steps, the Buyer State CANNOT be *unambiguously* and confidently identified, RETURN "-".
            *  **CONSISTENCY RULE:** Strive for maximum consistency. The same Buyer State MUST be extracted every time the same invoice is uploaded. If you are ever uncertain, RETURN "-".
            * **Missing Value:** If not found, return "-".

        * **Buyer GSTIN:**
            * **CRITICAL INSTRUCTION:** **Identify the GSTIN of the COMPANY BEING BILLED (the Buyer/Customer).** This GSTIN *must* be found within the **BUYER DETAILS SECTION** of the invoice OR **DIRECTLY ASSOCIATED WITH THE BUYER COMPANY NAME IF A DEDICATED BUYER SECTION IS NOT CLEARLY LABELED.**
            * **GSTIN Format Reminder:** Remember, a GSTIN is a 15-digit alphanumeric code like `08ABCDE9999F1Z8`. Look for values that match this pattern.
            - **Location Keywords for Buyer Section:** Look for headings or labels like "Buyer Details," "Bill To," "Billed To," "Customer Details," "Consignee Details," or "Shipped To." If these sections are NOT present, the Buyer GSTIN is the GSTIN associated with the Buyer Company Name.
            + **Location for Buyer GSTIN:** **FIRST, check for clearly labeled Buyer Sections** using headings or labels like "Buyer Details," "Bill To," "Billed To," "Customer Details," "Consignee Details," or "Shipped To." **If these sections are NOT found, then look for the GSTIN that is DIRECTLY ASSOCIATED with the Buyer Company Name.**  This association is often indicated by the GSTIN being immediately below or very close to the Buyer Company Name and potentially labeled with "GSTIN," "GST No.", etc.
            * **Keywords for GSTIN (Buyer Section):** Within the Buyer Section OR near the Buyer Company Name, look for labels such as "GSTIN," "GSTIN/UIN," "GST Number," or "GST No." The GSTIN value *immediately following* these keywords in the **Buyer Section or near the Buyer Company Name** is the Buyer GSTIN.
            * **ABSOLUTE EXCLUSION:** **DO NOT EXTRACT ANY GSTIN FROM THE "SUPPLIER," "SELLER," or "BILL FROM" SECTIONS as the Buyer GSTIN.** These are Supplier GSTINs and are *incorrect* for the Buyer GSTIN field.
            * **Scope:** Extract *only* the alphanumeric GSTIN value, excluding labels like "GSTIN:". Ensure it's in GSTIN format.
            *   **NEGATIVE CONSTRAINT (AVOID SUPPLIER GSTIN):** The Buyer GSTIN must NOT be the same as the extracted Supplier GSTIN. If they appear to be the same, return "-" for the Buyer GSTIN.
            -  **Handwritten GSTIN Verification (BUYER):** **Because the Buyer GSTIN appears to be handwritten in this invoice, it is CRITICAL to VISUALLY VERIFY EACH CHARACTER of the extracted GSTIN. Pay very close attention to distinguishing between similar-looking digits (like '1' and '7', '2' and 'Z', '5' and 'S', '6' and 'G', '8' and 'B', '9' and 'g'). If ANY character is unclear or ambiguous after visual inspection, or if there's even a slight doubt about accuracy, IMMEDIATELY RETURN "-" for the Buyer GSTIN.  Accuracy for GSTINs is paramount.**
            +  **Handwritten GSTIN Verification (BUYER) - **EXTREMELY IMPORTANT**: **Because the Buyer GSTIN is HANDWRITTEN in this invoice, it is absolutely critical to perform a detailed VISUAL VERIFICATION of each character.  OCR is known to be unreliable with handwriting, so you CANNOT rely on the OCR output alone. You MUST look at the image of the handwritten GSTIN and CAREFULLY VERIFY EACH character.**
            + **Specifically, pay EXTREME attention to distinguishing between digits and letters that look similar in handwriting.  Specifically, be very careful to differentiate between:**
                +  `'1'` (digit one) and `'I'` (uppercase I), `'l'` (lowercase l) - *User reported '1' captured as 'I'*
                +  `'0'` (digit zero) and `'O'` (uppercase O), `'D'` (uppercase D) - *User reported '0' captured as 'O'*
                +  `'2'` (digit two) and `'Z'` (uppercase Z) - *User reported '2' captured as 'Z'*
                +  `'5'` (digit five) and `'S'` (uppercase S)
                +  `'6'` (digit six) and `'G'` (uppercase G)
                +  `'8'` (digit eight) and `'B'` (uppercase B)
                +  `'9'` (digit nine) and `'g'` (lowercase g), `'q'` (lowercase q)
                +  `'7'` (digit seven) and `'T'` (uppercase T) (less common, but possible)
            + **When you see a character that could be interpreted as either a digit OR a letter, THINK LIKE A HUMAN would when carefully reading handwriting.** Consider the context (GSTIN format, expected characters - GSTINs use digits 0-9 and uppercase letters A-Z, no lowercase).  If it's *more likely* to be a digit in the context of a GSTIN, interpret it as the digit. If it's truly ambiguous even to a careful human reader, err on the side of caution.
            + **If, after this detailed visual verification and careful consideration, ANY character remains unclear, or if there is ANY doubt about whether it is a digit or a letter, or about its identity at all, then IMMEDIATELY and without hesitation, RETURN "-" for the Buyer GSTIN.**  It is far better to return "-" (missing) than to return an incorrect GSTIN due to misinterpreting handwriting.
            + **Remember, accuracy for GSTINs is absolutely paramount. Consistency and correctness are more important than attempting to extract a value if it's ambiguous in handwritten form.**
            *   **GSTIN Validation:** **After extracting a *potential* Buyer GSTIN, perform the following VALIDATION CHECKS to confirm its validity:**
                *   **Length Check:** The GSTIN *must* be exactly 15 characters long. If it is not, it is *invalid*.
                *   **State Code Check:** The first two digits *must* be numeric (0-9).  **(Example State Codes: 01-Jammu & Kashmir, 03-Punjab, 07-Delhi, 27-Maharashtra, 33-Tamil Nadu, etc.).** While a full state code validation is not required, ensure the first two characters are *numerals* representing a plausible state code.
                *   **PAN Check:** Characters 3 to 12 (next 10 characters) *must* be alphanumeric (digits 0-9 and uppercase letters A-Z).
                *   **Entity Code Check:** The 13th character *must* be alphanumeric (digits 1-9 or uppercase letters A-Z).
                *   **14th Character Check:** The 14th character *must* be the uppercase letter 'Z'.  **If the 14th character is *anything* other than 'Z' (including any number), the GSTIN is immediately invalid. Return "-".**
                *   **Checksum Digit:** The 15th character is a checksum (alphanumeric). While checksum calculation is not required, ensure it is alphanumeric.
                *   **Validation Failure:** **If the potential Buyer GSTIN FAILS *any* of these validation checks, it is *incorrect*. In case of validation failure, IMMEDIATELY DISREGARD the extracted GSTIN and return "-" for Buyer GSTIN. DO NOT attempt to "correct" an invalid GSTIN. A GSTIN that fails validation is *wrong* and should be treated as missing data ("-").**
            * **Unambiguous Identification MANDATORY:** IF, after applying ALL the above steps, the Buyer GSTIN CANNOT be *unambiguously* and confidently identified within the BUYER DETAILS SECTION **OR ASSOCIATED WITH THE BUYER COMPANY NAME**, RETURN "-".
            *  **CONSISTENCY RULE:** Strive for maximum consistency. The same Buyer GSTIN MUST be extracted every time the same invoice is uploaded. If you are ever uncertain, RETURN "-".
            * **Missing Value:** If a valid Buyer GSTIN **cannot be unambiguously** identified within the **BUYER DETAILS SECTION OR ASSOCIATED WITH THE BUYER COMPANY NAME**, return "-". Do not guess or extract from other sections.


        * **Taxable Value:**
            * **Instruction:** Extract the total taxable value before taxes. If the "Taxable Value" is explicitly mentioned, use that value. If it is not explicitly mentioned, return only "0" and do not include the total invoice amount in the taxable value.
            * **Unambiguous Identification MANDATORY:** IF, after applying ALL the above steps, the Taxable Value CANNOT be *unambiguously* and confidently identified, RETURN "0".
            *  **CONSISTENCY RULE:** Strive for maximum consistency. The same Taxable Value MUST be extracted every time the same invoice is uploaded. If you are ever uncertain, RETURN "0".
            * **Missing Value:** If "Taxable Value" is not found, return "0".


        * **Tax Rate:**
            * **Instruction:**  Extract all applicable tax rates.
                * **Priority:** Prioritize CGST or SGST rates.
                * **Same CGST/SGST:** If both CGST and SGST are present and identical for a given item, return that single rate for that item.
                * **Different CGST/SGST:** If CGST and SGST are present for a given item, but the rates are different, return both rates for that item, separated by a comma and space (e.g., "6%, 2.5%").
                * **IGST:** If only IGST or "Integrated Tax" is present for a given item, return that rate for that item.
                * **Multiple Rates:** If different items have different rates, report all rates found on the invoice, separated by a comma and space (e.g., "6%, 2.5%").
            * **Format:** Output as percentage value(s) (e.g., "5%", or "5%, 2.5%").
                * **Unambiguous Identification MANDATORY:**  If, after applying all the above steps, a tax rate for a line item cannot be unambiguously and confidently identified, represent that line item with "0%". If no tax rates can be identified on the entire invoice, return "0%".
            *  **CONSISTENCY RULE:** Strive for maximum consistency. The same tax rates must be extracted every time the same invoice is uploaded. If you are ever uncertain about a specific line item's tax rate, use "0%" for that line item.
            * **Inconsistent/Missing Rates for a Line Item:** If rates are missing or inconsistent across CGST, SGST, IGST, and "Integrated Tax" for a single line item, use "0%" for that line item. If there's no tax breakdown for a line item (e.g., just a total, with no CGST, SGST), default the rate for that line item to "0%".


        * **CGST:**
            * **Instruction:** Extract the CGST amount.
            * **Unambiguous Identification MANDATORY:** IF, after applying ALL the above steps, the CGST CANNOT be *unambiguously* and confidently identified, RETURN "0".
            *  **CONSISTENCY RULE:** Strive for maximum consistency. The same CGST MUST be extracted every time the same invoice is uploaded. If you are ever uncertain, RETURN "0".
            * **Missing Value:** If not found, return "0".

        * **SGST:**
            * **Instruction:** Extract the SGST amount.
            * **Unambiguous Identification MANDATORY:** IF, after applying ALL the above steps, the SGST CANNOT be *unambiguously* and confidently identified, RETURN "0".
            *  **CONSISTENCY RULE:** Strive for maximum consistency. The same SGST MUST be extracted every time the same invoice is uploaded. If you are ever uncertain, RETURN "0".
            * **Missing Value:** If not found, return "0".

        * **IGST:**
            * **Instruction:** Extract the IGST amount.
            * **Unambiguous Identification MANDATORY:** IF, after applying ALL the above steps, the IGST CANNOT be *unambiguously* and confidently identified, RETURN "0".
            *  **CONSISTENCY RULE:** Strive for maximum consistency. The same IGST MUST be extracted every time the same invoice is uploaded. If you are ever uncertain, RETURN "0".
            * **Missing Value:** If not found, return "0".

        * **Discount:**
            * **Instruction:** Extract any explicitly mentioned discount.
            * **Unambiguous Identification MANDATORY:** IF, after applying ALL the above steps, the Discount CANNOT be *unambiguously* and confidently identified, RETURN "0".
            *  **CONSISTENCY RULE:** Strive for maximum consistency. The same Discount MUST be extracted every time the same invoice is uploaded. If you are ever uncertain, RETURN "0".
            * **Missing Value:** If none found, return "0".

        * **Total Amount:**
            * **Instruction:** Total Amount of invoice bill, including taxes.
            *   **Unambiguous Identification MANDATORY:** IF, after applying ALL the above steps, the Total Amount CANNOT be *unambiguously* and confidently identified, RETURN "0".
            *  **CONSISTENCY RULE:** Strive for maximum consistency. The same Total Amount MUST be extracted every time the same invoice is uploaded. If you are ever uncertain, RETURN "0".
            * **Missing Value:** If missing, return "0".
            * **Double-Check with Amount in Words:**
                * **Check for Words:** Look for fields like "Amount Chargeable (in words)" or "Total Amount in (words)" that represent the total amount in words.
                * **Verify Consistency:** If an "Amount in words" field is found, **compare the numerical Total Amount with the amount expressed in words.**
                * **Prioritize Numerical Field:** The primary source for the Total Amount is still the numerical "Total Amount" field.  Use the "Amount in words" field as a *verification* step.
                * **Discrepancy Handling:** **If there is a clear and unresolvable discrepancy between the numerical Total Amount and the "Amount in words", OR if either field is ambiguous or unclear, then return "0" for the Total Amount.**  In cases of doubt, accuracy and consistency are paramount.
                * **No Words Field:** If no "Amount in words" field is found, extract the numerical Total Amount as per the original instructions, and do not return "0" simply because a words field is missing. The words field is for *verification*, not mandatory extraction.
                * **Focus on Primary Field:** Always prioritize the accurate extraction of the numerical "Total Amount" field first.  Only use the "Amount in words" for a secondary verification check when available.

        **Output Format:**

        Return the extracted information in the following JSON format:

        ```json
        {
            "Supplier Company Name": "<supplier_company_name>",
            "Supplier GSTIN": "<supplier_gstin>",
            "Address": "<address>",
            "Invoice No": "<invoice_no>",
            "Invoice Date": "<invoice_date>",
            "Supplier State Code": "<supplier_state_code>",
            "Buyer Company Name": "<buyer_company_name>",
            "Buyer State": "<buyer_state>",
            "Buyer GST": "<buyer_gst>",
            "Taxable Value": "<taxable_value>",
            "Tax Rate": "<tax_rate>",
            "CGST": "<cgst>",
            "SGST": "<sgst>",
            "IGST": "<igst>",
            "Discount": "<discount>",
            "Total Amount": "<total_amount>"
        }
        ```
        """

    try:
        response = await model.generate_content_async([system_prompt,prompt, content_part])
        raw_output = response.text
    except Exception as e:
        return {}

    start_index = raw_output.find("{")
    end_index = raw_output.rfind("}")

    if start_index == -1 or end_index == -1:
       return {}

    json_content = raw_output[start_index:end_index + 1]

    try:
        parsed_data = json.loads(json_content)
    except json.JSONDecodeError as e:
        return {}

    expected_fields = [
        "Supplier Company Name", "Supplier GSTIN", "Address", "Invoice No", "Invoice Date",
        "Supplier State Code","Buyer Company Name", "Buyer State", "Buyer GST", "Taxable Value",
        "Tax Rate", "CGST", "SGST", "IGST", "Discount", "Total Amount"
    ]
    for field in expected_fields:
        if field not in parsed_data or parsed_data[field] is None:
            parsed_data[field] = "-"

    # Address Cleaning
    if "Address" in parsed_data and parsed_data["Address"] != "-":
        address_parts = parsed_data["Address"].split()
        parsed_data["Address"] = " ".join(address_parts).replace(',',',').replace("\n", ", ").replace(",,", ", ")

    # Rate Cleaning
    if "Tax Rate" in parsed_data:
        if not parsed_data["Tax Rate"].endswith("%"):
            parsed_data["Tax Rate"] = "0%"

    # Discount Cleaning
    if "Discount" in parsed_data:
        if parsed_data["Discount"] == "NILL" or parsed_data["Discount"] == "":
            parsed_data["Discount"] = "0"

    # Buyer State Validation
    if "Buyer State" in parsed_data and parsed_data["Buyer State"] != "-":
        buyer_state = parsed_data["Buyer State"].strip()

        # Check if the Buyer State contains any digits
        if any(char.isdigit() for char in buyer_state):
            parsed_data["Buyer State"] = "-"
        else:
            valid_states = [
                "Andhra Pradesh", "Arunachal Pradesh", "Assam", "Bihar", "Chhattisgarh", "Goa",
                "Gujarat", "Haryana", "Himachal Pradesh", "Jharkhand", "Karnataka", "Kerala",
                "Madhya Pradesh", "Maharashtra", "Manipur", "Meghalaya", "Mizoram", "Nagaland",
                "Odisha", "Punjab", "Rajasthan", "Sikkim", "Tamil Nadu", "Telangana", "Tripura",
                "Uttar Pradesh", "Uttarakhand", "West Bengal"
            ]

            # Ensure only valid state names are allowed
            if buyer_state not in valid_states:
                parsed_data["Buyer State"] = "-"

    # Supplier Company Name Cleaning
    if "Supplier Company Name" in parsed_data and parsed_data["Supplier Company Name"]:
        if parsed_data.get("Supplier Company Name", "").strip().lower().startswith("curefoods"):
            parsed_data["Supplier Company Name"] = "-"
        parsed_data["Supplier Company Name"] = parsed_data["Supplier Company Name"].replace("\n", " ").strip()

    if "Buyer GST" in parsed_data and parsed_data["Buyer GST"] != "-":
        buyer_gst = parsed_data["Buyer GST"].strip()
        buyer_gst = buyer_gst.replace("GSTIN :", "").replace("GSTIN:", "").replace("GST No.", "").strip() 
        if not buyer_gst.isalnum() or len(buyer_gst) != 15: 
            parsed_data["Buyer GST"] = "-"
        else:
            parsed_data["Buyer GST"] = buyer_gst

    return parsed_data
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant