# [Web-page residue, not part of the program] You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
from fastapi import FastAPI, File, UploadFile, HTTPException
from fastapi.responses import JSONResponse
import google.generativeai as genai
import PIL.Image
from io import BytesIO
import json
import pandas as pd
import requests
from io import BytesIO
from PIL import Image
from urllib.parse import urlparse
import time
import os

# ASGI application object; route handlers below register themselves on it.
app = FastAPI()

# SECURITY: the Gemini API key must never live in source control. Any key that
# was previously hard-coded in this file should be treated as leaked and
# revoked. The key is read from the GOOGLE_API_KEY environment variable when
# the module is imported.
token = os.environ.get("GOOGLE_API_KEY")
if not token:
    # Fail at start-up with a clear message instead of letting every request
    # fail later inside the Gemini client.
    raise RuntimeError(
        "GOOGLE_API_KEY environment variable is not set; "
        "it is required to configure the Gemini client."
    )
genai.configure(api_key=token)

# Gemini 1.5 Pro and 1.5 Flash support direct PDF processing up to 3,600 pages and images.
# They accept PDF files with MIME type 'application/pdf' and images with 'image/png', 'image/jpeg', etc.
model = genai.GenerativeModel("gemini-1.5-flash")
# File-name suffixes (lower-case, including the dot) accepted by the service.
ALLOWED_EXTENSIONS = ['.jpg', '.jpeg', '.png', '.pdf', '.jfif', '.csv', '.xlsx']


def is_allowed_file(filename: str) -> bool:
    """Return True if *filename* ends with one of ``ALLOWED_EXTENSIONS``.

    The comparison is case-insensitive. A missing or non-string filename
    (for example ``None``) is reported as not allowed instead of raising.
    """
    if not isinstance(filename, str):
        return False
    # str.endswith accepts a tuple of candidate suffixes.
    return filename.lower().endswith(tuple(ALLOWED_EXTENSIONS))
def is_valid_url(link: str) -> bool:
    """Return True if *link* is an absolute URL ending in an allowed extension.

    A link is accepted only when it is a non-empty string other than the
    placeholder ``"-"``, parses with both a scheme and a network location,
    and its text ends (case-insensitively) with one of ``ALLOWED_EXTENSIONS``.

    Non-string values return False instead of raising. Blank spreadsheet
    cells typically reach this function as float NaN, which is truthy and
    would otherwise be handed to ``urlparse``.
    """
    if not isinstance(link, str):
        return False
    if not link or link == "-":
        return False
    try:
        parsed_url = urlparse(link)
    except ValueError:
        # urlparse rejects some malformed network locations (e.g. an
        # unterminated "[" in the host part).
        return False
    if not (parsed_url.scheme and parsed_url.netloc):
        return False
    return link.lower().endswith(tuple(ALLOWED_EXTENSIONS))
# Function to format the image for input to the Gemini model
def image_format(image_bytes, mime_type="image/png"):
    """Wrap raw image bytes in the inline-data dict passed to the model.

    Args:
        image_bytes: Encoded image content. Callers in this file pass PNG
            bytes produced by ``PIL.Image.save(..., format="PNG")``.
        mime_type: MIME type describing ``image_bytes``. Defaults to
            ``"image/png"``, which matches the existing callers.

    Returns:
        A dict with the keys ``"mime_type"`` and ``"data"``.
    """
    # Building a dict literal cannot fail, so no exception handling is needed.
    return {"mime_type": mime_type, "data": image_bytes}
@app.post("/extract-invoice")
async def extract_invoice(file: UploadFile = File(...)):
    """Extract invoice fields from an uploaded file.

    Supported uploads:

    * ``.csv`` / ``.xlsx``: a sheet with an ``Invoice link`` column. Every
      valid link is downloaded and processed; the response holds a list
      with one result per processed link.
    * ``.pdf``: sent to the model as ``application/pdf``.
    * images (``.jpg``, ``.jpeg``, ``.png``, ``.jfif``): converted to
      greyscale PNG and sent to the model.

    Returns:
        ``JSONResponse`` with ``{"extracted_data": ...}`` on success, or a
        500 response ``{"error": "<message>"}`` for unexpected failures.

    Raises:
        HTTPException: 400 for an unsupported file type or a sheet without
            the ``Invoice link`` column. An ``HTTPException`` raised while
            processing a link is propagated unchanged.
    """
    # The upload may carry no filename at all; treat that as unsupported.
    filename = file.filename or ""
    if not is_allowed_file(filename):
        raise HTTPException(
            status_code=400,
            detail="Invalid file type. Allowed formats are: .jpg, .jpeg, .png, .pdf, .jfif, .csv, .xlsx"
        )

    lowered = filename.lower()
    try:
        file_bytes = await file.read()

        if lowered.endswith(('.csv', '.xlsx')):
            if lowered.endswith('.csv'):
                df = pd.read_csv(BytesIO(file_bytes))
            else:
                df = pd.read_excel(BytesIO(file_bytes))

            if "Invoice link" not in df.columns:
                raise HTTPException(status_code=400, detail="Invoice link column not found")

            extracted_data = []
            for link in df['Invoice link']:
                # Blank cells are not strings (pandas yields NaN for them),
                # so check the type before validating the URL.
                if not isinstance(link, str) or not is_valid_url(link):
                    print(f"Skipping invalid link: {link}")
                    continue
                extracted_data.append(await process_invoice_link(link))
                print(f"Extracted {len(extracted_data)} invoice(s) from the provided link.")
        elif lowered.endswith('.pdf'):
            pdf_part = {
                "mime_type": "application/pdf",
                "data": file_bytes
            }
            extracted_data = await process_content(pdf_part)
        else:  # Handling image files (jpg, jpeg, png, jfif)
            pil_image = PIL.Image.open(BytesIO(file_bytes)).convert("L")
            image_bytes = BytesIO()
            pil_image.save(image_bytes, format="PNG")  # Convert image to bytes
            extracted_data = await process_content(image_format(image_bytes.getvalue()))

        return JSONResponse(content={"extracted_data": extracted_data})
    except HTTPException:
        # Deliberate HTTP errors (e.g. the 400 above) must reach the client
        # with their own status code instead of being rewritten as a 500.
        raise
    except Exception as e:
        return JSONResponse(status_code=500, content={"error": str(e)})
async def _process_downloaded_invoice(link: str, file_bytes: bytes):
    """Build the model input for downloaded invoice bytes and process it.

    The file type is taken from the text after the last ``.`` in *link*.
    """
    file_ext = link.rsplit('.', 1)[-1].lower()

    if file_ext == 'pdf':
        pdf_part = {
            "mime_type": "application/pdf",
            "data": file_bytes
        }
        return await process_content(pdf_part)

    if file_ext in ('jpg', 'jpeg', 'png', 'jfif'):
        try:
            pil_image = PIL.Image.open(BytesIO(file_bytes)).convert("L")
            image_bytes = BytesIO()
            pil_image.save(image_bytes, format="PNG")  # Convert image to bytes
        except Exception as e:
            # A file that cannot be decoded will not decode on a retry
            # either, so report it immediately.
            raise HTTPException(
                status_code=400,
                detail=f"Error processing invoice link: {str(e)}"
            ) from e
        return await process_content(image_format(image_bytes.getvalue()))

    raise HTTPException(status_code=400, detail="Unsupported file format in invoice link")


async def process_invoice_link(link: str):
    """Download the invoice at *link* and extract its fields.

    The download is attempted up to 20 times, waiting 10 seconds between
    attempts. Only transient conditions are retried: network errors,
    HTTP 429 and HTTP 5xx. Any other non-200 status, an unsupported file
    extension or an undecodable image fails immediately.

    Args:
        link: URL of a PDF or image invoice.

    Returns:
        Whatever ``process_content`` returns for the downloaded file.

    Raises:
        HTTPException: 400 when the file cannot be fetched or processed,
            including when every retry has been used up.
    """
    import asyncio
    from functools import partial

    retries = 20
    retry_delay = 10  # seconds between attempts
    request_timeout = 30  # seconds; keeps a stalled server from hanging the request
    loop = asyncio.get_running_loop()
    last_error = "no download attempt was made"

    for attempt in range(retries):
        try:
            # requests is a blocking library; run it in the default executor
            # so the event loop can keep serving other requests meanwhile.
            response = await loop.run_in_executor(
                None, partial(requests.get, link, timeout=request_timeout)
            )
        except requests.RequestException as e:
            last_error = str(e)
        else:
            status = response.status_code
            if status == 200:
                return await _process_downloaded_invoice(link, response.content)
            if status == 429:
                print(f"Quota exceeded. Retrying in {retry_delay} seconds...")
                last_error = "HTTP 429 (too many requests)"
            elif status >= 500:
                last_error = f"HTTP {status}"
            else:
                raise HTTPException(status_code=400, detail="Failed to fetch the file from the provided link")

        if attempt < retries - 1:
            # Non-blocking wait, unlike time.sleep.
            await asyncio.sleep(retry_delay)

    raise HTTPException(status_code=400, detail=f"Error processing invoice link: {last_error}")
# Renamed process_pdf to process_content and made it handle both PDF and image parts

# Prompt texts sent to the model. They are runtime data: do not re-indent or
# reflow them, because every character is part of the request.
_SYSTEM_PROMPT = """
You are the best analyst specialist in comprehending for tax invoice bills. Analyze the Input images or pdf in the form of tax invoice will be provided to you, and your task is to respond to questions based on the content of the input image or pdf.
"""

_EXTRACTION_PROMPT = """
**Task:** Extract information from a "TAX INVOICE" document.
**Objective:** Accurately extract specific fields from a tax invoice, with a CRITICAL focus on correctly identifying Supplier vs. Buyer information, especially GSTINs.
**Understanding GSTIN (GST Number) and PAN:**
Before extracting GSTINs, it's important to understand their format and relationship to PAN (Permanent Account Number):
1. **GSTIN Structure:** A GSTIN (Goods and Services Tax Identification Number) is a 15-digit alphanumeric code.
2. **GSTIN Example:** For example: `08ABCDE9999F1Z8`
3. **State Code (Digits 1-2):** The first two digits represent the state code. For example, `08` indicates Rajasthan.
4. **PAN (Digits 3-12):** The next ten digits are the Permanent Account Number (PAN) of the business entity.
5. **Entity Number (Digit 13):** The 13th digit indicates the number of businesses registered under the same PAN within a state.
6. **Unique Suffix (Digits 14-15):** The last two digits are system-generated unique characters.
7. **PAN Format:** A PAN is a 10-digit alphanumeric identifier, typically in the format `ABCDE1234F`. It will usually be present in the Supplier details section of the invoice, sometimes labeled as "PAN No." or "PAN".
**Instructions:**
1. **Document Type:** You are processing a "TAX INVOICE".
2. **Focus:** Identify and extract information *only* from the designated sections of the invoice.
3. **Accuracy:** Prioritize accuracy and exact matches, especially for alphanumeric codes like GSTIN and Invoice Numbers.
4. **Field Specificity:** Follow the detailed instructions for each field below precisely. Do not extract information from irrelevant sections or infer values.
5. **Supplier vs. Buyer Distinction (CRITICAL):** Pay very close attention to differentiating between Supplier (Seller/Invoice Issuer) and Buyer (Customer/Bill To) information. This is especially important for GSTIN and Company Name.
6. **"Curefoods India Pvt Ltd" Handling:** If "Curefoods India Pvt Ltd", "MA CURE FOOD INDIA", or "M/s CURE FOOD INDIA" appears in the invoice, treat it as the **Buyer** and *never* as the **Supplier**.
7. **Default Values:** If a requested field cannot be unambiguously identified, return the specified default value (e.g., "-", "0", "0%").
**Fields to Extract (with detailed instructions):**
* **Supplier Company Name:**
* **Instruction:** Locate the legal business name of the invoice issuer. This is typically at the top of the invoice, often near a logo, in the header, and associated with the Supplier GSTIN and address. Identify the *seller's* or *"Bill From"* company name.
* **Exclusion:** Do *not* extract the buyer's name or any other company name mentioned elsewhere on the invoice.
* **Buyer Check:** If "Curefoods India Pvt Ltd", "MA CURE FOOD INDIA", or "M/s CURE FOOD INDIA" is found, it is the buyer, *not* the supplier.
* **Supplier GSTIN:**
* **CRITICAL INSTRUCTION:** **Identify the GSTIN of the COMPANY ISSUING THIS INVOICE (the Seller/Supplier).** This GSTIN *must* be found within the **SUPPLIER DETAILS SECTION** of the invoice, typically in the header area, near the Supplier Company Name and Address.
* **GSTIN Format Reminder:** Remember, a GSTIN is a 15-digit alphanumeric code like `08ABCDE9999F1Z8`. Look for values that match this pattern.
* **Location Keywords for Supplier Section:** Look for headings or labels like "Supplier Details," "Seller Details," "Invoice From," or simply the company logo and address at the **top of the document**. The Supplier GSTIN will be in *this* section.
* **Keywords for GSTIN (Supplier Section):** Within the Supplier Section, look for labels such as "GSTIN," "GSTIN/UIN," "GST Number," or "GST No." The GSTIN value *immediately following* these keywords in the **Supplier Section** is the Supplier GSTIN.
* **Additional Check - PAN Proximity:** **After identifying a potential Supplier GSTIN, check if a PAN number (10-digit alphanumeric string like `ABCDE1234F`, potentially labeled "PAN No." or "PAN") is present *nearby* within the Supplier Details section.** The presence of a PAN number in close proximity strengthens the likelihood that you have correctly identified the Supplier GSTIN. Treat this as an additional confirmation step.
* **ABSOLUTE EXCLUSION:** **DO NOT EXTRACT ANY GSTIN FROM THE "BUYER," "BILL TO," or "CUSTOMER" SECTIONS as the Supplier GSTIN.** These are Buyer GSTINs and are *incorrect* for the Supplier GSTIN field.
* **"Curefoods India Pvt Ltd" Rule REINFORCED:** If "Curefoods India Pvt Ltd", "MA CURE FOOD INDIA", or "M/s CURE FOOD INDIA" is present, it is ALWAYS the **Buyer**. Ignore any GSTIN associated with these names when looking for the **Supplier GSTIN**.
* **Format:** Ensure the extracted value is in the alphanumeric GSTIN format.
* **Missing Value:** If you **cannot unambiguously** identify a GSTIN within the **SUPPLIER DETAILS SECTION**, return "-". Do not guess or extract from other sections.
* **Address:**
* **Instruction:** Extract the complete supplier address block, including all lines.
* **Exclusion:** Do not include buyer address details.
* **Invoice No:**
* **Instruction:** Locate the alphanumeric invoice identifier, typically near the "Invoice Date" and labeled "Invoice No.", "Invoice Number", etc.
* **Format:** Capture the exact alphanumeric sequence.
* **Invoice Date:**
* **Instruction:** Extract the invoice issue date.
* **Format:** Convert to [DAY/MONTH/YEAR] format (e.g., [01/12/2024]).
* **Supplier State Code:**
* **Instruction:** Extract the numeric state code *only* from the explicit "State Code" field.
* **Source:** Do *not* infer or fetch from the address.
* **Missing Value:** If "State Code" field is missing or empty, return "-".
* **Buyer Company Name:**
* **Instruction:** Locate the company name the invoice is billed to, typically under headings like "Bill to:", "Billed to:", "Consignee to:", or "Shipped to:".
* **Missing Value:** If not found, return "-".
* **Buyer State:**
* **Instruction:** Extract the full name of the buyer's state, usually in the "BILL TO" section.
* **Missing Value:** If not found, return "-".
* **Buyer GSTIN:**
* **CRITICAL INSTRUCTION:** **Identify the GSTIN of the COMPANY BEING BILLED (the Buyer/Customer).** This GSTIN *must* be found within the **BUYER DETAILS SECTION** of the invoice.
* **GSTIN Format Reminder:** Remember, a GSTIN is a 15-digit alphanumeric code like `08ABCDE9999F1Z8`. Look for values that match this pattern.
* **Location Keywords for Buyer Section:** Look for headings or labels like "Buyer Details," "Bill To," "Billed To," "Customer Details," "Consignee Details," or "Shipped To." The Buyer GSTIN will be in *this* section.
* **Keywords for GSTIN (Buyer Section):** Within the Buyer Section, look for labels such as "GSTIN," "GSTIN/UIN," "GST Number," or "GST No." The GSTIN value *immediately following* these keywords in the **Buyer Section** is the Buyer GSTIN.
* **ABSOLUTE EXCLUSION:** **DO NOT EXTRACT ANY GSTIN FROM THE "SUPPLIER," "SELLER," or "BILL FROM" SECTIONS as the Buyer GSTIN.** These are Supplier GSTINs and are *incorrect* for the Buyer GSTIN field.
* **Scope:** Extract *only* the alphanumeric GSTIN value, excluding labels like "GSTIN:". Ensure it's in GSTIN format.
* **Missing Value:** If a valid Buyer GSTIN **cannot be unambiguously** identified within the **BUYER DETAILS SECTION**, return "-". Do not guess or extract from other sections.
* **Taxable Value:**
* **Instruction:** Extract the total taxable value before taxes.
* **Missing Value:** If not explicitly mentioned, return "0".
* **Tax Rate:**
* **Instruction:** Extract the applicable tax rate. Prioritize CGST or SGST rates. If both CGST and SGST are present and identical, return that rate. If not found or different, check for IGST or "Integrated Tax".
* **Format:** Output as a percentage value (e.g., "5%").
* **Inconsistent/Missing Rates:** If rates are missing or inconsistent across CGST, SGST, IGST, and "Integrated Tax", return "0%".
* **CGST:**
* **Instruction:** Extract the CGST amount.
* **Missing Value:** If not found, return "0".
* **SGST:**
* **Instruction:** Extract the SGST amount.
* **Missing Value:** If not found, return "0".
* **IGST:**
* **Instruction:** Extract the IGST amount.
* **Missing Value:** If not found, return "0".
* **Discount:**
* **Instruction:** Extract any explicitly mentioned discount.
* **Missing Value:** If none found, return "0".
* **Total Amount:**
* **Instruction:** Extract the final payable amount.
* **Missing Value:** If missing, return "0".
**Output Format:**
Return the extracted information in the following JSON format:
```json
{
"Supplier Company Name": "<supplier_company_name>",
"Supplier GSTIN": "<supplier_gstin>",
"Address": "<address>",
"Invoice No": "<invoice_no>",
"Invoice Date": "<invoice_date>",
"Supplier State Code": "<supplier_state_code>",
"Buyer Company Name": "<buyer_company_name>",
"Buyer State": "<buyer_state>",
"Buyer GST": "<buyer_gst>",
"Taxable Value": "<taxable_value>",
"Tax Rate": "<tax_rate>",
"CGST": "<cgst>",
"SGST": "<sgst>",
"IGST": "<igst>",
"Discount": "<discount>",
"Total Amount": "<total_amount>"
}
```
"""

# Keys every result is guaranteed to contain (missing ones become "-").
_EXPECTED_FIELDS = (
    "Supplier Company Name", "Supplier GSTIN", "Address", "Invoice No", "Invoice Date",
    "Supplier State Code", "Buyer Company Name", "Buyer State", "Buyer GST", "Taxable Value",
    "Tax Rate", "CGST", "SGST", "IGST", "Discount", "Total Amount",
)

# Accepted "Buyer State" values: the states plus the union territories, so a
# buyer located in e.g. Delhi is not discarded.
_BUYER_STATES = (
    "Andhra Pradesh", "Arunachal Pradesh", "Assam", "Bihar", "Chhattisgarh", "Goa",
    "Gujarat", "Haryana", "Himachal Pradesh", "Jharkhand", "Karnataka", "Kerala",
    "Madhya Pradesh", "Maharashtra", "Manipur", "Meghalaya", "Mizoram", "Nagaland",
    "Odisha", "Punjab", "Rajasthan", "Sikkim", "Tamil Nadu", "Telangana", "Tripura",
    "Uttar Pradesh", "Uttarakhand", "West Bengal",
    "Andaman and Nicobar Islands", "Chandigarh",
    "Dadra and Nagar Haveli and Daman and Diu", "Delhi", "Jammu and Kashmir",
    "Ladakh", "Lakshadweep", "Puducherry",
)

# Lower-cased name -> canonical spelling, for case-insensitive matching.
_BUYER_STATE_LOOKUP = {name.lower(): name for name in _BUYER_STATES}


def _extract_json_object(raw_output):
    """Return the JSON object embedded in *raw_output*, or None if absent/invalid."""
    start_index = raw_output.find("{")
    end_index = raw_output.rfind("}")
    if start_index == -1 or end_index < start_index:
        return None
    try:
        parsed = json.loads(raw_output[start_index:end_index + 1])
    except json.JSONDecodeError:
        return None
    return parsed if isinstance(parsed, dict) else None


def _clean_invoice_fields(parsed_data):
    """Fill in missing fields and normalise the extracted values in place."""
    for field in _EXPECTED_FIELDS:
        if parsed_data.get(field) is None:
            parsed_data[field] = "-"

    # The model may return numbers instead of strings, so every value is
    # passed through str() before string methods are applied to it.

    # Address Cleaning: collapse all whitespace (including newlines) to single
    # spaces, then repair doubled commas.
    address = parsed_data["Address"]
    if address != "-":
        parsed_data["Address"] = " ".join(str(address).split()).replace(",,", ", ")

    # Rate Cleaning: anything that is not a percentage is reported as "0%".
    tax_rate = str(parsed_data["Tax Rate"]).strip()
    parsed_data["Tax Rate"] = tax_rate if tax_rate.endswith("%") else "0%"

    # Discount Cleaning: empty / "nil" style values mean no discount.
    if str(parsed_data["Discount"]).strip().upper() in ("", "NIL", "NILL"):
        parsed_data["Discount"] = "0"

    # Buyer State Validation: keep only known names, compared
    # case-insensitively and stored in canonical spelling.
    buyer_state = parsed_data["Buyer State"]
    if buyer_state != "-":
        parsed_data["Buyer State"] = _BUYER_STATE_LOOKUP.get(
            str(buyer_state).strip().lower(), "-"
        )

    # Supplier Company Name Cleaning: Curefoods is always the buyer, so it is
    # never accepted as the supplier.
    supplier_name = parsed_data["Supplier Company Name"]
    if supplier_name:
        supplier_name = str(supplier_name)
        if supplier_name.strip().lower().startswith("curefoods"):
            supplier_name = "-"
        parsed_data["Supplier Company Name"] = supplier_name.replace("\n", " ").strip()

    # Buyer GST: drop common label prefixes, then apply a basic format check
    # (15 alphanumeric characters).
    buyer_gst = parsed_data["Buyer GST"]
    if buyer_gst != "-":
        buyer_gst = str(buyer_gst).strip()
        for label in ("GSTIN :", "GSTIN:", "GST No."):
            buyer_gst = buyer_gst.replace(label, "")
        buyer_gst = buyer_gst.strip()
        if buyer_gst.isalnum() and len(buyer_gst) == 15:
            parsed_data["Buyer GST"] = buyer_gst
        else:
            parsed_data["Buyer GST"] = "-"

    return parsed_data


async def process_content(content_part):
    """Send an invoice (PDF or image part) to the model and return its fields.

    Args:
        content_part: dict with ``"mime_type"`` and ``"data"`` keys, as built
            by the callers in this file for PDFs and images.

    Returns:
        A dict containing every key in ``_EXPECTED_FIELDS`` (missing values
        are ``"-"``), or an empty dict when the model call fails or its
        reply contains no parseable JSON object. Failures are printed so
        that an empty result can be diagnosed.
    """
    try:
        response = await model.generate_content_async(
            [_SYSTEM_PROMPT, _EXTRACTION_PROMPT, content_part]
        )
        raw_output = response.text
    except Exception as e:
        # Best-effort contract: callers expect {} rather than an exception.
        print(f"Invoice extraction request failed: {e}")
        return {}

    parsed_data = _extract_json_object(raw_output)
    if parsed_data is None:
        print("Invoice extraction returned no parseable JSON object.")
        return {}

    return _clean_invoice_fields(parsed_data)
# [Web-page residue, not part of the program] The text was updated successfully, but these errors were encountered: